1 /* Copyright (C) 2002-2016 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19 #include <assert.h>
20 #include <errno.h>
21 #include <signal.h>
22 #include <stdint.h>
23 #include <string.h>
24 #include <unistd.h>
25 #include <sys/mman.h>
26 #include <sys/param.h>
27 #include <dl-sysdep.h>
28 #include <dl-tls.h>
29 #include <tls.h>
30 #include <list.h>
31 #include <lowlevellock.h>
32 #include <futex-internal.h>
33 #include <kernel-features.h>
34 #include <stack-aliasing.h>
35
36
37 #ifndef NEED_SEPARATE_REGISTER_STACK
38
39 /* Most architectures have exactly one stack pointer. Some have more. */
40 # define STACK_VARIABLES void *stackaddr = NULL
41
42 /* How to pass the values to the 'create_thread' function. */
43 # define STACK_VARIABLES_ARGS stackaddr
44
45 /* How to declare the function which gets these parameters. */
46 # define STACK_VARIABLES_PARMS void *stackaddr
47
48 /* How to declare allocate_stack. */
49 # define ALLOCATE_STACK_PARMS void **stack
50
51 /* This is how the function is called. We do it this way to allow
52 other variants of the function to have more parameters. */
53 # define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)
54
55 #else
56
57 /* We need two stacks. The kernel will place them but we have to tell
58 the kernel about the size of the reserved address space. */
59 # define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0
60
61 /* How to pass the values to the 'create_thread' function. */
62 # define STACK_VARIABLES_ARGS stackaddr, stacksize
63
64 /* How to declare the function which gets these parameters. */
65 # define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize
66
67 /* How to declare allocate_stack. */
68 # define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize
69
70 /* This is how the function is called. We do it this way to allow
71 other variants of the function to have more parameters. */
72 # define ALLOCATE_STACK(attr, pd) \
73 allocate_stack (attr, pd, &stackaddr, &stacksize)
74
75 #endif
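/* Illustrative sketch (editorial addition, not part of the original source):
   a caller such as pthread_create is expected to combine these macros
   roughly as shown below.  The exact create_thread signature is
   architecture- and version-specific, so treat this only as an assumption
   about the calling convention:

     STACK_VARIABLES;
     struct pthread *pd;
     int err = ALLOCATE_STACK (attr, &pd);
     if (err == 0)
       err = create_thread (pd, attr, STACK_VARIABLES_ARGS);

   With NEED_SEPARATE_REGISTER_STACK defined, STACK_VARIABLES_ARGS expands
   to both the stack address and the reserved size, so create_thread
   receives one extra argument without any change at the call site.  */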
76
77
78 /* Default alignment of stack. */
79 #ifndef STACK_ALIGN
80 # define STACK_ALIGN __alignof__ (long double)
81 #endif
82
83 /* Default value for minimal stack size after allocating thread
84 descriptor and guard. */
85 #ifndef MINIMAL_REST_STACK
86 # define MINIMAL_REST_STACK 4096
87 #endif
88
89
90 /* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
91 a stack. Use it when possible. */
92 #ifndef MAP_STACK
93 # define MAP_STACK 0
94 #endif
95
96 /* This yields the pointer that TLS support code calls the thread pointer. */
97 #if TLS_TCB_AT_TP
98 # define TLS_TPADJ(pd) (pd)
99 #elif TLS_DTV_AT_TP
100 # define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
101 #endif
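/* Rough layout sketch (editorial addition, simplified):

     TLS_TCB_AT_TP:   [ static TLS ][ TCB == struct pthread ]
                                      ^ thread pointer == pd

     TLS_DTV_AT_TP:   [ struct pthread ][ TCB ][ static TLS ]
                                          ^ thread pointer
                                            == (char *) pd + TLS_PRE_TCB_SIZE

   TLS_TPADJ therefore converts a struct pthread pointer into the TCB
   pointer that the TLS machinery (GET_DTV, _dl_allocate_tls_init, ...)
   expects on either kind of architecture.  */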
102
103 /* Cache handling for free stacks that have not yet been unmapped. */
104
105 /* Maximum size of the cache, in bytes. */
106 static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40 MiB by default. */
107 static size_t stack_cache_actsize;
108
109 /* Lock protecting the stack cache and the lists of stacks below. */
110 static int stack_cache_lock = LLL_LOCK_INITIALIZER;
111
112 /* List of queued (currently unused) stacks. */
113 static LIST_HEAD (stack_cache);
114
115 /* List of the stacks in use. */
116 static LIST_HEAD (stack_used);
117
118 /* We need to record which list operation is in progress so that, in
119 case of an asynchronous interruption due to a fork() call, we can
120 complete or undo the half-finished work. */
121 static uintptr_t in_flight_stack;
122
123 /* List of the threads with user provided stacks in use. No need to
124 initialize this, since it's done in __pthread_initialize_minimal. */
125 list_t __stack_user __attribute__ ((nocommon));
126 hidden_data_def (__stack_user)
127
128 #if COLORING_INCREMENT != 0
129 /* Number of threads created. */
130 static unsigned int nptl_ncreated;
131 #endif
132
133
134 /* Check whether the stack is still used or not. */
135 #define FREE_P(descr) ((descr)->tid <= 0)
136
137
138 static void
139 stack_list_del (list_t *elem)
140 {
141 in_flight_stack = (uintptr_t) elem;
142
143 atomic_write_barrier ();
144
145 list_del (elem);
146
147 atomic_write_barrier ();
148
149 in_flight_stack = 0;
150 }
151
152
153 static void
154 stack_list_add (list_t *elem, list_t *list)
155 {
156 in_flight_stack = (uintptr_t) elem | 1;
157
158 atomic_write_barrier ();
159
160 list_add (elem, list);
161
162 atomic_write_barrier ();
163
164 in_flight_stack = 0;
165 }
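/* Sketch of the fork-recovery protocol used above (descriptive comment
   added for clarity; __reclaim_stacks below is the consumer).  The element
   and the kind of operation are encoded in a single word:

     in_flight_stack = (uintptr_t) elem;        deletion in progress
     in_flight_stack = (uintptr_t) elem | 1;    insertion in progress

   list_t elements are assumed to be at least pointer-aligned, so bit 0 is
   free to serve as the add/delete flag.  After a fork the child inspects
   this word: a pending delete can always simply be replayed, while a
   half-finished add is detected by an inconsistent list head (the old
   first element's prev pointer already refers to ELEM) and is then
   completed.  */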
166
167
168 /* We create a doubly linked list of all cache entries. Doubly linked
169 because this allows removing entries from the end. */
170
171
172 /* Get a stack from the cache. We have to match by size since
173 some blocks might be too small or far too large. */
174 static struct pthread *
175 get_cached_stack (size_t *sizep, void **memp)
176 {
177 size_t size = *sizep;
178 struct pthread *result = NULL;
179 list_t *entry;
180
181 lll_lock (stack_cache_lock, LLL_PRIVATE);
182
183 /* Search the cache for a matching entry. We search for the
184 smallest stack which has at least the required size. Note that
185 in normal situations the size of all allocated stacks is the
186 same. At the very least there are only a few different sizes.
187 Therefore this loop will exit early most of the time with an
188 exact match. */
189 list_for_each (entry, &stack_cache)
190 {
191 struct pthread *curr;
192
193 curr = list_entry (entry, struct pthread, list);
194 if (FREE_P (curr) && curr->stackblock_size >= size)
195 {
196 if (curr->stackblock_size == size)
197 {
198 result = curr;
199 break;
200 }
201
202 if (result == NULL
203 || result->stackblock_size > curr->stackblock_size)
204 result = curr;
205 }
206 }
207
208 if (__builtin_expect (result == NULL, 0)
209 /* Make sure the size difference is not too excessive. In that
210 case we do not use the block. */
211 || __builtin_expect (result->stackblock_size > 4 * size, 0))
212 {
213 /* Release the lock. */
214 lll_unlock (stack_cache_lock, LLL_PRIVATE);
215
216 return NULL;
217 }
218
219 /* Don't allow setxid until cloned. */
220 result->setxid_futex = -1;
221
222 /* Dequeue the entry. */
223 stack_list_del (&result->list);
224
225 /* And add to the list of stacks in use. */
226 stack_list_add (&result->list, &stack_used);
227
228 /* And decrease the cache size. */
229 stack_cache_actsize -= result->stackblock_size;
230
231 /* Release the lock early. */
232 lll_unlock (stack_cache_lock, LLL_PRIVATE);
233
234 /* Report size and location of the stack to the caller. */
235 *sizep = result->stackblock_size;
236 *memp = result->stackblock;
237
238 /* Cancellation handling is back to the default. */
239 result->cancelhandling = 0;
240 result->cleanup = NULL;
241
242 /* No pending event. */
243 result->nextevent = NULL;
244
245 /* Clear the DTV. */
246 dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
247 for (size_t cnt = 0; cnt < dtv[-1].counter; ++cnt)
248 if (! dtv[1 + cnt].pointer.is_static
249 && dtv[1 + cnt].pointer.val != TLS_DTV_UNALLOCATED)
250 free (dtv[1 + cnt].pointer.val);
251 memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));
252
253 /* Re-initialize the TLS. */
254 _dl_allocate_tls_init (TLS_TPADJ (result));
255
256 return result;
257 }
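/* Worked example of the reuse policy above (editorial addition): a request
   for a 256 KiB stack will reuse a cached 256 KiB or 512 KiB block (the
   best fit wins), but a cached 2 MiB block is rejected because 2 MiB is
   more than 4 * 256 KiB; get_cached_stack then returns NULL and the caller
   mmaps a fresh block of the requested size instead.  */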
258
259
260 /* Free stacks until cache size is lower than LIMIT. */
261 void
262 __free_stacks (size_t limit)
263 {
264 /* We reduce the size of the cache. Remove the last entries until
265 the size is below the limit. */
266 list_t *entry;
267 list_t *prev;
268
269 /* Search from the end of the list. */
270 list_for_each_prev_safe (entry, prev, &stack_cache)
271 {
272 struct pthread *curr;
273
274 curr = list_entry (entry, struct pthread, list);
275 if (FREE_P (curr))
276 {
277 /* Unlink the block. */
278 stack_list_del (entry);
279
280 /* Account for the freed memory. */
281 stack_cache_actsize -= curr->stackblock_size;
282
283 /* Free the memory associated with the ELF TLS. */
284 _dl_deallocate_tls (TLS_TPADJ (curr), false);
285
286 /* Remove this block. This should never fail. If it does
287 something is really wrong. */
288 if (munmap (curr->stackblock, curr->stackblock_size) != 0)
289 abort ();
290
291 /* Maybe we have freed enough. */
292 if (stack_cache_actsize <= limit)
293 break;
294 }
295 }
296 }
297
298
299 /* Add a stack which is not used anymore to the cache. Must be
300 called with the cache lock held. */
301 static inline void
302 __attribute ((always_inline))
303 queue_stack (struct pthread *stack)
304 {
305 /* We unconditionally add the stack to the list. The memory may
306 still be in use but it will not be reused until the kernel marks
307 the stack as not used anymore. */
308 stack_list_add (&stack->list, &stack_cache);
309
310 stack_cache_actsize += stack->stackblock_size;
311 if (__glibc_unlikely (stack_cache_actsize > stack_cache_maxsize))
312 __free_stacks (stack_cache_maxsize);
313 }
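/* Back-of-the-envelope note (editorial addition, based on the defaults
   above): with stack_cache_maxsize at 40 MiB and a typical default stack
   size of 8 MiB (usually derived from RLIMIT_STACK and picked up through
   __default_pthread_attr), roughly five exited threads' stacks can sit in
   the cache before queue_stack starts calling __free_stacks to trim it
   back under the limit.  Smaller per-thread stacks allow proportionally
   more cached entries.  */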
314
315
316 static int
317 internal_function
318 change_stack_perm (struct pthread *pd
319 #ifdef NEED_SEPARATE_REGISTER_STACK
320 , size_t pagemask
321 #endif
322 )
323 {
324 #ifdef NEED_SEPARATE_REGISTER_STACK
325 void *stack = (pd->stackblock
326 + (((((pd->stackblock_size - pd->guardsize) / 2)
327 & pagemask) + pd->guardsize) & pagemask));
328 size_t len = pd->stackblock + pd->stackblock_size - stack;
329 #elif _STACK_GROWS_DOWN
330 void *stack = pd->stackblock + pd->guardsize;
331 size_t len = pd->stackblock_size - pd->guardsize;
332 #elif _STACK_GROWS_UP
333 void *stack = pd->stackblock;
334 size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
335 #else
336 # error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
337 #endif
338 if (mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
339 return errno;
340
341 return 0;
342 }
343
344
345 /* Returns a usable stack for a new thread either by allocating a
346 new stack or reusing a cached stack of sufficient size.
347 ATTR must be non-NULL and point to a valid pthread_attr.
348 PDP must be non-NULL. */
349 static int
350 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
351 ALLOCATE_STACK_PARMS)
352 {
353 struct pthread *pd;
354 size_t size;
355 size_t pagesize_m1 = __getpagesize () - 1;
356
357 assert (powerof2 (pagesize_m1 + 1));
358 assert (TCB_ALIGNMENT >= STACK_ALIGN);
359
360 /* Get the stack size from the attribute if it is set. Otherwise we
361 use the default we determined at start time. */
362 if (attr->stacksize != 0)
363 size = attr->stacksize;
364 else
365 {
366 lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
367 size = __default_pthread_attr.stacksize;
368 lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
369 }
370
371 /* Get memory for the stack. */
372 if (__glibc_unlikely (attr->flags & ATTR_FLAG_STACKADDR))
373 {
374 uintptr_t adj;
375 char *stackaddr = (char *) attr->stackaddr;
376
377 /* Assume the same layout as the _STACK_GROWS_DOWN case, with struct
378 pthread at the top of the stack block. Later we adjust the guard
379 location and stack address to match the _STACK_GROWS_UP case. */
380 if (_STACK_GROWS_UP)
381 stackaddr += attr->stacksize;
382
383 /* If the user also specified the size of the stack make sure it
384 is large enough. */
385 if (attr->stacksize != 0
386 && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
387 return EINVAL;
388
389 /* Adjust stack size for alignment of the TLS block. */
390 #if TLS_TCB_AT_TP
391 adj = ((uintptr_t) stackaddr - TLS_TCB_SIZE)
392 & __static_tls_align_m1;
393 assert (size > adj + TLS_TCB_SIZE);
394 #elif TLS_DTV_AT_TP
395 adj = ((uintptr_t) stackaddr - __static_tls_size)
396 & __static_tls_align_m1;
397 assert (size > adj);
398 #endif
399
400 /* The user provided some memory. Let's hope it matches the
401 size... We do not allocate guard pages if the user provided
402 the stack. It is the user's responsibility to do this if it
403 is wanted. */
404 #if TLS_TCB_AT_TP
405 pd = (struct pthread *) ((uintptr_t) stackaddr
406 - TLS_TCB_SIZE - adj);
407 #elif TLS_DTV_AT_TP
408 pd = (struct pthread *) (((uintptr_t) stackaddr
409 - __static_tls_size - adj)
410 - TLS_PRE_TCB_SIZE);
411 #endif
412
413 /* The user provided stack memory needs to be cleared. */
414 memset (pd, '\0', sizeof (struct pthread));
415
416 /* The first TSD block is included in the TCB. */
417 pd->specific[0] = pd->specific_1stblock;
418
419 /* Remember the stack-related values. */
420 pd->stackblock = (char *) stackaddr - size;
421 pd->stackblock_size = size;
422
423 /* This is a user-provided stack. It will not be queued in the
424 stack cache nor will the memory (except the TLS memory) be freed. */
425 pd->user_stack = true;
426
427 /* This is at least the second thread. */
428 pd->header.multiple_threads = 1;
429 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
430 __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
431 #endif
432
433 #ifndef __ASSUME_PRIVATE_FUTEX
434 /* The thread must know when private futexes are supported. */
435 pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
436 header.private_futex);
437 #endif
438
439 #ifdef NEED_DL_SYSINFO
440 SETUP_THREAD_SYSINFO (pd);
441 #endif
442
443 /* The process ID is also the same as that of the caller. */
444 pd->pid = THREAD_GETMEM (THREAD_SELF, pid);
445
446 /* Don't allow setxid until cloned. */
447 pd->setxid_futex = -1;
448
449 /* Allocate the DTV for this thread. */
450 if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
451 {
452 /* Something went wrong. */
453 assert (errno == ENOMEM);
454 return errno;
455 }
456
457
458 /* Prepare to modify global data. */
459 lll_lock (stack_cache_lock, LLL_PRIVATE);
460
461 /* And add to the list of stacks in use. */
462 list_add (&pd->list, &__stack_user);
463
464 lll_unlock (stack_cache_lock, LLL_PRIVATE);
465 }
466 else
467 {
468 /* Allocate some anonymous memory. If possible use the cache. */
469 size_t guardsize;
470 size_t reqsize;
471 void *mem;
472 const int prot = (PROT_READ | PROT_WRITE
473 | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
474
475 #if COLORING_INCREMENT != 0
476 /* Add one more page for stack coloring. Don't do it for stacks
477 with 16 times pagesize or larger. This might just cause
478 unnecessary misalignment. */
479 if (size <= 16 * pagesize_m1)
480 size += pagesize_m1 + 1;
481 #endif
482
483 /* Adjust the stack size for alignment. */
484 size &= ~__static_tls_align_m1;
485 assert (size != 0);
486
487 /* Make sure the size of the stack is large enough for the guard and
488 possibly the thread descriptor. */
489 guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
490 if (__builtin_expect (size < ((guardsize + __static_tls_size
491 + MINIMAL_REST_STACK + pagesize_m1)
492 & ~pagesize_m1),
493 0))
494 /* The stack is too small (or the guard too large). */
495 return EINVAL;
496
497 /* Try to get a stack from the cache. */
498 reqsize = size;
499 pd = get_cached_stack (&size, &mem);
500 if (pd == NULL)
501 {
502 /* To avoid aliasing effects on a larger scale than pages we
503 adjust the allocated stack size if necessary. This way
504 allocations directly following each other will not have
505 aliasing problems. */
506 #if MULTI_PAGE_ALIASING != 0
507 if ((size % MULTI_PAGE_ALIASING) == 0)
508 size += pagesize_m1 + 1;
509 #endif
510
511 mem = mmap (NULL, size, prot,
512 MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
513
514 if (__glibc_unlikely (mem == MAP_FAILED))
515 return errno;
516
517 /* SIZE is guaranteed to be greater than zero.
518 So we can never get a null pointer back from mmap. */
519 assert (mem != NULL);
520
521 #if COLORING_INCREMENT != 0
522 /* Atomically increment NCREATED. */
523 unsigned int ncreated = atomic_increment_val (&nptl_ncreated);
524
525 /* We choose the offset for coloring by incrementing it for
526 every new thread by a fixed amount. The offset is used
527 modulo the page size. Even if coloring relative to higher
528 alignment values would be better, it makes no sense to
529 do it since the mmap() interface does not allow us to
530 specify any alignment for the returned memory block. */
531 size_t coloring = (ncreated * COLORING_INCREMENT) & pagesize_m1;
532
533 /* Make sure the coloring offset does not disturb the alignment
534 of the TCB and static TLS block. */
535 if (__glibc_unlikely ((coloring & __static_tls_align_m1) != 0))
536 coloring = (((coloring + __static_tls_align_m1)
537 & ~(__static_tls_align_m1))
538 & ~pagesize_m1);
539 #else
540 /* Unless specified we do not make any adjustments. */
541 # define coloring 0
542 #endif
543
544 /* Place the thread descriptor at the end of the stack. */
545 #if TLS_TCB_AT_TP
546 pd = (struct pthread *) ((char *) mem + size - coloring) - 1;
547 #elif TLS_DTV_AT_TP
548 pd = (struct pthread *) ((((uintptr_t) mem + size - coloring
549 - __static_tls_size)
550 & ~__static_tls_align_m1)
551 - TLS_PRE_TCB_SIZE);
552 #endif
553
554 /* Remember the stack-related values. */
555 pd->stackblock = mem;
556 pd->stackblock_size = size;
557
558 /* We allocated the first block of the thread-specific data array.
559 This address will not change for the lifetime of this
560 descriptor. */
561 pd->specific[0] = pd->specific_1stblock;
562
563 /* This is at least the second thread. */
564 pd->header.multiple_threads = 1;
565 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
566 __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
567 #endif
568
569 #ifndef __ASSUME_PRIVATE_FUTEX
570 /* The thread must know when private futexes are supported. */
571 pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
572 header.private_futex);
573 #endif
574
575 #ifdef NEED_DL_SYSINFO
576 SETUP_THREAD_SYSINFO (pd);
577 #endif
578
579 /* Don't allow setxid until cloned. */
580 pd->setxid_futex = -1;
581
582 /* The process ID is also the same as that of the caller. */
583 pd->pid = THREAD_GETMEM (THREAD_SELF, pid);
584
585 /* Allocate the DTV for this thread. */
586 if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
587 {
588 /* Something went wrong. */
589 assert (errno == ENOMEM);
590
591 /* Free the stack memory we just allocated. */
592 (void) munmap (mem, size);
593
594 return errno;
595 }
596
597
598 /* Prepare to modify global data. */
599 lll_lock (stack_cache_lock, LLL_PRIVATE);
600
601 /* And add to the list of stacks in use. */
602 stack_list_add (&pd->list, &stack_used);
603
604 lll_unlock (stack_cache_lock, LLL_PRIVATE);
605
606
607 /* There might have been a race. Another thread might have
608 caused the stacks to get exec permission while this new
609 stack was prepared. Detect if this was possible and
610 change the permission if necessary. */
611 if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
612 && (prot & PROT_EXEC) == 0, 0))
613 {
614 int err = change_stack_perm (pd
615 #ifdef NEED_SEPARATE_REGISTER_STACK
616 , ~pagesize_m1
617 #endif
618 );
619 if (err != 0)
620 {
621 /* Free the stack memory we just allocated. */
622 (void) munmap (mem, size);
623
624 return err;
625 }
626 }
627
628
629 /* Note that all of the stack and the thread descriptor are
630 zeroed. This means we do not have to initialize fields
631 with an initial value of zero. This is specifically true for
632 the 'tid' field, which is always set back to zero once the
633 stack is not used anymore, and for the 'guardsize' field,
634 which will be read next. */
635 }
636
637 /* Create or resize the guard area if necessary. */
638 if (__glibc_unlikely (guardsize > pd->guardsize))
639 {
640 #ifdef NEED_SEPARATE_REGISTER_STACK
641 char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
642 #elif _STACK_GROWS_DOWN
643 char *guard = mem;
644 #elif _STACK_GROWS_UP
645 char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
646 #endif
647 if (mprotect (guard, guardsize, PROT_NONE) != 0)
648 {
649 mprot_error:
650 lll_lock (stack_cache_lock, LLL_PRIVATE);
651
652 /* Remove the thread from the list. */
653 stack_list_del (&pd->list);
654
655 lll_unlock (stack_cache_lock, LLL_PRIVATE);
656
657 /* Get rid of the TLS block we allocated. */
658 _dl_deallocate_tls (TLS_TPADJ (pd), false);
659
660 /* Free the stack memory regardless of whether the size
661 of the cache is over the limit or not. If this piece
662 of memory caused problems we had better not use it
663 anymore. We also ignore possible errors; there
664 is nothing we could do. */
665 (void) munmap (mem, size);
666
667 return errno;
668 }
669
670 pd->guardsize = guardsize;
671 }
672 else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
673 0))
674 {
675 /* The old guard area is too large. */
676
677 #ifdef NEED_SEPARATE_REGISTER_STACK
678 char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
679 char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);
680
681 if (oldguard < guard
682 && mprotect (oldguard, guard - oldguard, prot) != 0)
683 goto mprot_error;
684
685 if (mprotect (guard + guardsize,
686 oldguard + pd->guardsize - guard - guardsize,
687 prot) != 0)
688 goto mprot_error;
689 #elif _STACK_GROWS_DOWN
690 if (mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
691 prot) != 0)
692 goto mprot_error;
693 #elif _STACK_GROWS_UP
694 if (mprotect ((char *) pd - pd->guardsize,
695 pd->guardsize - guardsize, prot) != 0)
696 goto mprot_error;
697 #endif
698
699 pd->guardsize = guardsize;
700 }
701 /* The pthread_getattr_np() calls need to be given the guard size
702 requested in the attribute, regardless of how large the
703 guard actually in use is. */
704 pd->reported_guardsize = guardsize;
705 }
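/* Guard placement sketch (editorial addition, simplified; addresses grow
   to the right):

     _STACK_GROWS_DOWN:  [ guard ][ usable stack <--- ][ TLS / struct pthread ]
                         ^ mem                                     mem + size ^

     _STACK_GROWS_UP:    [ usable stack ---> ][ guard ][ TLS / struct pthread ]
                         ^ mem                                     mem + size ^

   In both cases the descriptor sits at the high end of the block; only the
   side of the usable area that the guard protects differs.  With
   NEED_SEPARATE_REGISTER_STACK the guard is instead placed in the middle,
   between the memory stack and the register backing store.  */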
706
707 /* Initialize the lock. We have to do this unconditionally since the
708 stillborn thread could be canceled while the lock is taken. */
709 pd->lock = LLL_LOCK_INITIALIZER;
710
711 /* The robust mutex lists also need to be initialized
712 unconditionally because the cleanup for the previous stack owner
713 might have happened in the kernel. */
714 pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
715 - offsetof (pthread_mutex_t,
716 __data.__list.__next));
717 pd->robust_head.list_op_pending = NULL;
718 #ifdef __PTHREAD_MUTEX_HAVE_PREV
719 pd->robust_prev = &pd->robust_head;
720 #endif
721 pd->robust_head.list = &pd->robust_head;
722
723 /* We place the thread descriptor at the end of the stack. */
724 *pdp = pd;
725
726 #if _STACK_GROWS_DOWN
727 void *stacktop;
728
729 # if TLS_TCB_AT_TP
730 /* The stack begins before the TCB and the static TLS block. */
731 stacktop = ((char *) (pd + 1) - __static_tls_size);
732 # elif TLS_DTV_AT_TP
733 stacktop = (char *) (pd - 1);
734 # endif
735
736 # ifdef NEED_SEPARATE_REGISTER_STACK
737 *stack = pd->stackblock;
738 *stacksize = stacktop - *stack;
739 # else
740 *stack = stacktop;
741 # endif
742 #else
743 *stack = pd->stackblock;
744 #endif
745
746 return 0;
747 }
748
749
750 void
751 internal_function
752 __deallocate_stack (struct pthread *pd)
753 {
754 lll_lock (stack_cache_lock, LLL_PRIVATE);
755
756 /* Remove the thread from whichever list it is on: threads with
757 library-allocated stacks or threads with user-provided stacks. */
758 stack_list_del (&pd->list);
759
760 /* Not much to do. Just free the mmap()ed memory. Note that we do
761 not reset the 'used' flag in the 'tid' field. This is done by
762 the kernel. If no thread has been created yet this field is
763 still zero. */
764 if (__glibc_likely (! pd->user_stack))
765 (void) queue_stack (pd);
766 else
767 /* Free the memory associated with the ELF TLS. */
768 _dl_deallocate_tls (TLS_TPADJ (pd), false);
769
770 lll_unlock (stack_cache_lock, LLL_PRIVATE);
771 }
772
773
774 int
775 internal_function
776 __make_stacks_executable (void **stack_endp)
777 {
778 /* First the main thread's stack. */
779 int err = _dl_make_stack_executable (stack_endp);
780 if (err != 0)
781 return err;
782
783 #ifdef NEED_SEPARATE_REGISTER_STACK
784 const size_t pagemask = ~(__getpagesize () - 1);
785 #endif
786
787 lll_lock (stack_cache_lock, LLL_PRIVATE);
788
789 list_t *runp;
790 list_for_each (runp, &stack_used)
791 {
792 err = change_stack_perm (list_entry (runp, struct pthread, list)
793 #ifdef NEED_SEPARATE_REGISTER_STACK
794 , pagemask
795 #endif
796 );
797 if (err != 0)
798 break;
799 }
800
801 /* Also change the permission for the currently unused stacks. This
802 might be wasted time, but it is better spent here than on adding a
803 check in the fast path. */
804 if (err == 0)
805 list_for_each (runp, &stack_cache)
806 {
807 err = change_stack_perm (list_entry (runp, struct pthread, list)
808 #ifdef NEED_SEPARATE_REGISTER_STACK
809 , pagemask
810 #endif
811 );
812 if (err != 0)
813 break;
814 }
815
816 lll_unlock (stack_cache_lock, LLL_PRIVATE);
817
818 return err;
819 }
820
821
822 /* In case of a fork() call the child inherits the same memory
823 allocations, but only one thread is running. All stacks except that
824 of the one running thread are not used anymore. We have to recycle
825 them. */
826 void
827 __reclaim_stacks (void)
828 {
829 struct pthread *self = (struct pthread *) THREAD_SELF;
830
831 /* No locking necessary. The calling thread is the only one running.
832 But we have to be aware that we might have interrupted a list
833 operation. */
834
835 if (in_flight_stack != 0)
836 {
837 bool add_p = in_flight_stack & 1;
838 list_t *elem = (list_t *) (in_flight_stack & ~(uintptr_t) 1);
839
840 if (add_p)
841 {
842 /* We always add at the beginning of the list. So in this case we
843 only need to check the heads of these lists to see whether their
844 pointers are inconsistent. */
845 list_t *l = NULL;
846
847 if (stack_used.next->prev != &stack_used)
848 l = &stack_used;
849 else if (stack_cache.next->prev != &stack_cache)
850 l = &stack_cache;
851
852 if (l != NULL)
853 {
854 assert (l->next->prev == elem);
855 elem->next = l->next;
856 elem->prev = l;
857 l->next = elem;
858 }
859 }
860 else
861 {
862 /* We can simply always replay the delete operation. */
863 elem->next->prev = elem->prev;
864 elem->prev->next = elem->next;
865 }
866 }
867
868 /* Mark all stacks except the still running one as free. */
869 list_t *runp;
870 list_for_each (runp, &stack_used)
871 {
872 struct pthread *curp = list_entry (runp, struct pthread, list);
873 if (curp != self)
874 {
875 /* This marks the stack as free. */
876 curp->tid = 0;
877
878 /* The PID field must be initialized for the new process. */
879 curp->pid = self->pid;
880
881 /* Account for the size of the stack. */
882 stack_cache_actsize += curp->stackblock_size;
883
884 if (curp->specific_used)
885 {
886 /* Clear the thread-specific data. */
887 memset (curp->specific_1stblock, '\0',
888 sizeof (curp->specific_1stblock));
889
890 curp->specific_used = false;
891
892 for (size_t cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
893 if (curp->specific[cnt] != NULL)
894 {
895 memset (curp->specific[cnt], '\0',
896 sizeof (curp->specific_1stblock));
897
898 /* We allocated this block and do not free it
899 here, so set the bit again. */
901 }
902 }
903 }
904 }
905
906 /* Reset the PIDs in any cached stacks. */
907 list_for_each (runp, &stack_cache)
908 {
909 struct pthread *curp = list_entry (runp, struct pthread, list);
910 curp->pid = self->pid;
911 }
912
913 /* Add the stacks of all formerly running threads to the cache. */
914 list_splice (&stack_used, &stack_cache);
915
916 /* Remove the entry for the current thread from the cache list
917 and add it to the list of running threads. Which of the two
918 lists it is added to is decided by the user_stack flag. */
919 stack_list_del (&self->list);
920
921 /* Re-initialize the lists for all the threads. */
922 INIT_LIST_HEAD (&stack_used);
923 INIT_LIST_HEAD (&__stack_user);
924
925 if (__glibc_unlikely (THREAD_GETMEM (self, user_stack)))
926 list_add (&self->list, &__stack_user);
927 else
928 list_add (&self->list, &stack_used);
929
930 /* There is one thread running. */
931 __nptl_nthreads = 1;
932
933 in_flight_stack = 0;
934
935 /* Initialize locks. */
936 stack_cache_lock = LLL_LOCK_INITIALIZER;
937 __default_pthread_attr_lock = LLL_LOCK_INITIALIZER;
938 }
939
940
941 #if HP_TIMING_AVAIL
942 # undef __find_thread_by_id
943 /* Find a thread given the thread ID. */
944 attribute_hidden
945 struct pthread *
946 __find_thread_by_id (pid_t tid)
947 {
948 struct pthread *result = NULL;
949
950 lll_lock (stack_cache_lock, LLL_PRIVATE);
951
952 /* Iterate over the list with system-allocated threads first. */
953 list_t *runp;
954 list_for_each (runp, &stack_used)
955 {
956 struct pthread *curp;
957
958 curp = list_entry (runp, struct pthread, list);
959
960 if (curp->tid == tid)
961 {
962 result = curp;
963 goto out;
964 }
965 }
966
967 /* Now the list with threads using user-allocated stacks. */
968 list_for_each (runp, &__stack_user)
969 {
970 struct pthread *curp;
971
972 curp = list_entry (runp, struct pthread, list);
973
974 if (curp->tid == tid)
975 {
976 result = curp;
977 goto out;
978 }
979 }
980
981 out:
982 lll_unlock (stack_cache_lock, LLL_PRIVATE);
983
984 return result;
985 }
986 #endif
987
988
989 #ifdef SIGSETXID
990 static void
991 internal_function
992 setxid_mark_thread (struct xid_command *cmdp, struct pthread *t)
993 {
994 int ch;
995
996 /* Wait until this thread is cloned. */
997 if (t->setxid_futex == -1
998 && ! atomic_compare_and_exchange_bool_acq (&t->setxid_futex, -2, -1))
999 do
1000 futex_wait_simple (&t->setxid_futex, -2, FUTEX_PRIVATE);
1001 while (t->setxid_futex == -2);
1002
1003 /* Don't let the thread exit before the setxid handler runs. */
1004 t->setxid_futex = 0;
1005
1006 do
1007 {
1008 ch = t->cancelhandling;
1009
1010 /* If the thread is exiting right now, ignore it. */
1011 if ((ch & EXITING_BITMASK) != 0)
1012 {
1013 /* Release the futex if there is no other setxid in
1014 progress. */
1015 if ((ch & SETXID_BITMASK) == 0)
1016 {
1017 t->setxid_futex = 1;
1018 futex_wake (&t->setxid_futex, 1, FUTEX_PRIVATE);
1019 }
1020 return;
1021 }
1022 }
1023 while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
1024 ch | SETXID_BITMASK, ch));
1025 }
1026
1027
1028 static void
1029 internal_function
1030 setxid_unmark_thread (struct xid_command *cmdp, struct pthread *t)
1031 {
1032 int ch;
1033
1034 do
1035 {
1036 ch = t->cancelhandling;
1037 if ((ch & SETXID_BITMASK) == 0)
1038 return;
1039 }
1040 while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
1041 ch & ~SETXID_BITMASK, ch));
1042
1043 /* Release the futex just in case. */
1044 t->setxid_futex = 1;
1045 futex_wake (&t->setxid_futex, 1, FUTEX_PRIVATE);
1046 }
1047
1048
1049 static int
1050 internal_function
1051 setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
1052 {
1053 if ((t->cancelhandling & SETXID_BITMASK) == 0)
1054 return 0;
1055
1056 int val;
1057 INTERNAL_SYSCALL_DECL (err);
1058 val = INTERNAL_SYSCALL (tgkill, err, 3, THREAD_GETMEM (THREAD_SELF, pid),
1059 t->tid, SIGSETXID);
1060
1061 /* If this failed, the thread must not have started yet or it has already exited. */
1062 if (!INTERNAL_SYSCALL_ERROR_P (val, err))
1063 {
1064 atomic_increment (&cmdp->cntr);
1065 return 1;
1066 }
1067 else
1068 return 0;
1069 }
1070
1071 /* Check for consistency across set*id system call results. The abort
1072 should not happen as long as all privilege changes happen through
1073 the glibc wrappers. ERROR must be 0 (no error) or an errno
1074 code. */
1075 void
1076 attribute_hidden
1077 __nptl_setxid_error (struct xid_command *cmdp, int error)
1078 {
1079 do
1080 {
1081 int olderror = cmdp->error;
1082 if (olderror == error)
1083 break;
1084 if (olderror != -1)
1085 /* Mismatch between current and previous results. */
1086 abort ();
1087 }
1088 while (atomic_compare_and_exchange_bool_acq (&cmdp->error, error, -1));
1089 }
1090
1091 int
1092 attribute_hidden
1093 __nptl_setxid (struct xid_command *cmdp)
1094 {
1095 int signalled;
1096 int result;
1097 lll_lock (stack_cache_lock, LLL_PRIVATE);
1098
1099 __xidcmd = cmdp;
1100 cmdp->cntr = 0;
1101 cmdp->error = -1;
1102
1103 struct pthread *self = THREAD_SELF;
1104
1105 /* Iterate over the list with system-allocated threads first. */
1106 list_t *runp;
1107 list_for_each (runp, &stack_used)
1108 {
1109 struct pthread *t = list_entry (runp, struct pthread, list);
1110 if (t == self)
1111 continue;
1112
1113 setxid_mark_thread (cmdp, t);
1114 }
1115
1116 /* Now the list with threads using user-allocated stacks. */
1117 list_for_each (runp, &__stack_user)
1118 {
1119 struct pthread *t = list_entry (runp, struct pthread, list);
1120 if (t == self)
1121 continue;
1122
1123 setxid_mark_thread (cmdp, t);
1124 }
1125
1126 /* Iterate until we don't succeed in signalling anyone. That means
1127 we have reached all running threads, and their children will
1128 automatically be correct once started. */
1129 do
1130 {
1131 signalled = 0;
1132
1133 list_for_each (runp, &stack_used)
1134 {
1135 struct pthread *t = list_entry (runp, struct pthread, list);
1136 if (t == self)
1137 continue;
1138
1139 signalled += setxid_signal_thread (cmdp, t);
1140 }
1141
1142 list_for_each (runp, &__stack_user)
1143 {
1144 struct pthread *t = list_entry (runp, struct pthread, list);
1145 if (t == self)
1146 continue;
1147
1148 signalled += setxid_signal_thread (cmdp, t);
1149 }
1150
1151 int cur = cmdp->cntr;
1152 while (cur != 0)
1153 {
1154 futex_wait_simple ((unsigned int *) &cmdp->cntr, cur,
1155 FUTEX_PRIVATE);
1156 cur = cmdp->cntr;
1157 }
1158 }
1159 while (signalled != 0);
1160
1161 /* Clean up flags, so that no thread blocks during exit waiting
1162 for a signal which will never come. */
1163 list_for_each (runp, &stack_used)
1164 {
1165 struct pthread *t = list_entry (runp, struct pthread, list);
1166 if (t == self)
1167 continue;
1168
1169 setxid_unmark_thread (cmdp, t);
1170 }
1171
1172 list_for_each (runp, &__stack_user)
1173 {
1174 struct pthread *t = list_entry (runp, struct pthread, list);
1175 if (t == self)
1176 continue;
1177
1178 setxid_unmark_thread (cmdp, t);
1179 }
1180
1181 /* This must be last, otherwise the current thread might not have
1182 permission to send the SIGSETXID signal to the other threads. */
1183 INTERNAL_SYSCALL_DECL (err);
1184 result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, err, 3,
1185 cmdp->id[0], cmdp->id[1], cmdp->id[2]);
1186 int error = 0;
1187 if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (result, err)))
1188 {
1189 error = INTERNAL_SYSCALL_ERRNO (result, err);
1190 __set_errno (error);
1191 result = -1;
1192 }
1193 __nptl_setxid_error (cmdp, error);
1194
1195 lll_unlock (stack_cache_lock, LLL_PRIVATE);
1196 return result;
1197 }
1198 #endif /* SIGSETXID. */
1199
1200
1201 static inline void __attribute__((always_inline))
1202 init_one_static_tls (struct pthread *curp, struct link_map *map)
1203 {
1204 # if TLS_TCB_AT_TP
1205 void *dest = (char *) curp - map->l_tls_offset;
1206 # elif TLS_DTV_AT_TP
1207 void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
1208 # else
1209 # error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
1210 # endif
1211
1212 /* We cannot delay the initialization of the Static TLS area, since
1213 it can be accessed with LE or IE, but since the DTV is only used
1214 by GD and LD, we can delay its update to avoid a race. */
1215 memset (__mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
1216 '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
1217 }
1218
1219 void
1220 attribute_hidden
1221 __pthread_init_static_tls (struct link_map *map)
1222 {
1223 lll_lock (stack_cache_lock, LLL_PRIVATE);
1224
1225 /* Iterate over the list with system-allocated threads first. */
1226 list_t *runp;
1227 list_for_each (runp, &stack_used)
1228 init_one_static_tls (list_entry (runp, struct pthread, list), map);
1229
1230 /* Now the list with threads using user-allocated stacks. */
1231 list_for_each (runp, &__stack_user)
1232 init_one_static_tls (list_entry (runp, struct pthread, list), map);
1233
1234 lll_unlock (stack_cache_lock, LLL_PRIVATE);
1235 }
1236
1237
1238 void
1239 attribute_hidden
1240 __wait_lookup_done (void)
1241 {
1242 lll_lock (stack_cache_lock, LLL_PRIVATE);
1243
1244 struct pthread *self = THREAD_SELF;
1245
1246 /* Iterate over the list with system-allocated threads first. */
1247 list_t *runp;
1248 list_for_each (runp, &stack_used)
1249 {
1250 struct pthread *t = list_entry (runp, struct pthread, list);
1251 if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1252 continue;
1253
1254 int *const gscope_flagp = &t->header.gscope_flag;
1255
1256 /* We have to wait until this thread is done with the global
1257 scope. First tell the thread that we are waiting and
1258 possibly have to be woken. */
1259 if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1260 THREAD_GSCOPE_FLAG_WAIT,
1261 THREAD_GSCOPE_FLAG_USED))
1262 continue;
1263
1264 do
1265 futex_wait_simple ((unsigned int *) gscope_flagp,
1266 THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
1267 while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1268 }
1269
1270 /* Now the list with threads using user-allocated stacks. */
1271 list_for_each (runp, &__stack_user)
1272 {
1273 struct pthread *t = list_entry (runp, struct pthread, list);
1274 if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1275 continue;
1276
1277 int *const gscope_flagp = &t->header.gscope_flag;
1278
1279 /* We have to wait until this thread is done with the global
1280 scope. First tell the thread that we are waiting and
1281 possibly have to be woken. */
1282 if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1283 THREAD_GSCOPE_FLAG_WAIT,
1284 THREAD_GSCOPE_FLAG_USED))
1285 continue;
1286
1287 do
1288 futex_wait_simple ((unsigned int *) gscope_flagp,
1289 THREAD_GSCOPE_FLAG_WAIT, FUTEX_PRIVATE);
1290 while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1291 }
1292
1293 lll_unlock (stack_cache_lock, LLL_PRIVATE);
1294 }