Add per-thread cache to malloc

author DJ Delorie <dj@delorie.com>

Thu, 6 Jul 2017 17:37:30 +0000 (13:37 -0400)

committer DJ Delorie <dj@delorie.com>

Thu, 6 Jul 2017 17:37:30 +0000 (13:37 -0400)
author DJ Delorie <dj@delorie.com>
Thu, 6 Jul 2017 17:37:30 +0000 (13:37 -0400)
committer DJ Delorie <dj@delorie.com>
Thu, 6 Jul 2017 17:37:30 +0000 (13:37 -0400)
diff --git a/ChangeLog b/ChangeLog

index ac598984e368043b45cce9a098accadd37e7792a..1c51b3be297bf0ee8304d465dcd3a80e5922132f 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,31 @@
+2017-07-06  DJ Delorie  <dj@delorie.com>
+
+       * config.make.in: Enable experimental malloc option.
+       * configure.ac: Likewise.
+       * configure: Regenerate.
+       * manual/install.texi: Document it.
+       * INSTALL: Regenerate.
+       * malloc/Makefile: Define USE_TCACHE from experimental-malloc.
+       * malloc/malloc.c: Add per-thread cache (tcache).
+       (tcache_put): New.
+       (tcache_get): New.
+       (tcache_thread_freeres): New.
+       (tcache_init): New.
+       (__libc_malloc): Use cached chunks if available.
+       (__libc_free): Initialize tcache if needed.
+       (__libc_realloc): Likewise.
+       (__libc_calloc): Likewise.
+       (_int_malloc): Prefill tcache when appropriate.
+       (_int_free): Likewise.
+       (do_set_tcache_max): New.
+       (do_set_tcache_count): New.
+       (do_set_tcache_unsorted_limit): New.
+       * manual/probes.texi: Document new probes.
+       * malloc/arena.c: Add new tcache tunables.
+       * elf/dl-tunables.list: Likewise.
+       * manual/tunables.texi: Document them.
+       * NEWS: Mention the per-thread cache.
+
  2017-07-06  Joseph Myers  <joseph@codesourcery.com>
  
         * iconvdata/tst-loading.c (TIMEOUT): Define to 30.
diff --git a/INSTALL b/INSTALL

index 5c745244a6f8cb618878480fd571bbd152c395e0..0ff87ed60325bccde491765fc333ef4112a05b31 100644 (file)
--- a/INSTALL
+++ b/INSTALL
@@ -200,6 +200,12 @@ will be used, and CFLAGS sets optimization options for the compiler.
       libnss_nisplus are not built at all.  Use this option to enable
       libnsl with all depending NSS modules and header files.
  
+'--disable-experimental-malloc'
+     By default, a per-thread cache is enabled in 'malloc'.  While this
+     cache can be disabled on a per-application basis using tunables
+     (set glibc.malloc.tcache_count to zero), this option can be used to
+     remove it from the build completely.
+
  '--build=BUILD-SYSTEM'
  '--host=HOST-SYSTEM'
       These options are for cross-compiling.  If you specify both options
diff --git a/NEWS b/NEWS

index 616b3244e7983984d631acf7ba65717d483ff2ce..1266276fcd50acd95175a7355a07a7252e150265 100644 (file)
--- a/NEWS
+++ b/NEWS
@@ -9,6 +9,14 @@ Version 2.26
  
  Major new features:
  
+* A per-thread cache has been added to malloc. Access to the cache requires
+  no locks and therefore significantly accelerates the fast path to allocate
+  and free small amounts of memory. Refilling an empty cache requires locking
+  the underlying arena. Performance measurements show significant gains in a
+  wide variety of user workloads. Workloads were captured using a special
+  instrumented malloc and analyzed with a malloc simulator. Contributed by
+  DJ Delorie with the help of Florian Weimer and Carlos O'Donell.
+
  * Unicode 10.0.0 Support: Character encoding, character type info, and
    transliteration tables are all updated to Unicode 10.0.0, using
    generator scripts contributed by Mike FABIAN (Red Hat).
diff --git a/config.make.in b/config.make.in

index dadabf9b6af53234b818f8d3bfbe28f64cb030fe..5a4a054612acd2272bfdb34309345855b3a15a09 100644 (file)
--- a/config.make.in
+++ b/config.make.in
@@ -78,6 +78,8 @@ multi-arch = @multi_arch@
  
  mach-interface-list = @mach_interface_list@
  
+experimental-malloc = @experimental_malloc@
+
  nss-crypt = @libc_cv_nss_crypt@
  static-nss-crypt = @libc_cv_static_nss_crypt@
  
diff --git a/configure b/configure

index 8e5c5bdde040b1c2a29f5831509ef72ba69acf0a..d8e1c50e11183975d123ac5a9ac28ddd2659b934 100755 (executable)
--- a/configure
+++ b/configure
@@ -674,6 +674,7 @@ build_obsolete_nsl
  link_obsolete_rpc
  libc_cv_static_nss_crypt
  libc_cv_nss_crypt
+experimental_malloc
  enable_werror
  all_warnings
  force_install
@@ -779,6 +780,7 @@ enable_kernel
  enable_all_warnings
  enable_werror
  enable_multi_arch
+enable_experimental_malloc
  enable_nss_crypt
  enable_obsolete_rpc
  enable_obsolete_nsl
@@ -1450,6 +1452,8 @@ Optional Features:
    --disable-werror        do not build with -Werror
    --enable-multi-arch     enable single DSO with optimizations for multiple
                            architectures
+  --disable-experimental-malloc
+                          disable experimental malloc features
    --enable-nss-crypt      enable libcrypt to use nss
    --enable-obsolete-rpc   build and install the obsolete RPC code for
                            link-time usage
@@ -3522,6 +3526,15 @@ else
  fi
  
  
+# Check whether --enable-experimental-malloc was given.
+if test "${enable_experimental_malloc+set}" = set; then :
+  enableval=$enable_experimental_malloc; experimental_malloc=$enableval
+else
+  experimental_malloc=yes
+fi
+
+
+
  # Check whether --enable-nss-crypt was given.
  if test "${enable_nss_crypt+set}" = set; then :
    enableval=$enable_nss_crypt; nss_crypt=$enableval
diff --git a/configure.ac b/configure.ac

index 7e03019c4f66f6b050d4bddf0b2f7f35209f7db8..77456aa8d9d35ae557f7cfc4aa98ffd1349e0066 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -313,6 +313,13 @@ AC_ARG_ENABLE([multi-arch],
               [multi_arch=$enableval],
               [multi_arch=default])
  
+AC_ARG_ENABLE([experimental-malloc],
+             AC_HELP_STRING([--disable-experimental-malloc],
+                            [disable experimental malloc features]),
+             [experimental_malloc=$enableval],
+             [experimental_malloc=yes])
+AC_SUBST(experimental_malloc)
+
  AC_ARG_ENABLE([nss-crypt],
               AC_HELP_STRING([--enable-nss-crypt],
                              [enable libcrypt to use nss]),
diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list

index df4f9622b4d89160205da0648249e7bccb087891..c188c6ad52d554119bcf54cf225ca0d526c4629f 100644 (file)
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -76,6 +76,18 @@ glibc {
        minval: 1
        security_level: SXID_IGNORE
      }
+    tcache_max {
+      type: SIZE_T
+      security_level: SXID_ERASE
+    }
+    tcache_count {
+      type: SIZE_T
+      security_level: SXID_ERASE
+    }
+    tcache_unsorted_limit {
+      type: SIZE_T
+      security_level: SXID_ERASE
+    }
    }
    tune {
      hwcap_mask {
diff --git a/malloc/Makefile b/malloc/Makefile

index b50de7cd6c5b2c0295d6394621abcdc89f75df24..3fa395b94932f074c91287bc5525303603eb50bb 100644 (file)
--- a/malloc/Makefile
+++ b/malloc/Makefile
@@ -189,6 +189,11 @@ tst-malloc-usable-static-ENV = $(tst-malloc-usable-ENV)
  tst-malloc-usable-tunables-ENV = GLIBC_TUNABLES=glibc.malloc.check=3
  tst-malloc-usable-static-tunables-ENV = $(tst-malloc-usable-tunables-ENV)
  
+ifeq ($(experimental-malloc),yes)
+CPPFLAGS-malloc.c += -DUSE_TCACHE=1
+else
+CPPFLAGS-malloc.c += -DUSE_TCACHE=0
+endif
  # Uncomment this for test releases.  For public releases it is too expensive.
  #CPPFLAGS-malloc.o += -DMALLOC_DEBUG=1
  
diff --git a/malloc/arena.c b/malloc/arena.c

index 660d638c93a2f799060578817ba5ec3807ae3693..dc14fae152fd6e2129aa34e72e64d80a9690c650 100644 (file)
--- a/malloc/arena.c
+++ b/malloc/arena.c
@@ -236,6 +236,11 @@ TUNABLE_CALLBACK_FNDECL (set_perturb_byte, int32_t)
  TUNABLE_CALLBACK_FNDECL (set_trim_threshold, size_t)
  TUNABLE_CALLBACK_FNDECL (set_arena_max, size_t)
  TUNABLE_CALLBACK_FNDECL (set_arena_test, size_t)
+#if USE_TCACHE
+TUNABLE_CALLBACK_FNDECL (set_tcache_max, size_t)
+TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t)
+TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
+#endif
  #else
  /* Initialization routine. */
  #include <string.h>
@@ -322,6 +327,12 @@ ptmalloc_init (void)
    TUNABLE_GET (mmap_max, int32_t, TUNABLE_CALLBACK (set_mmaps_max));
    TUNABLE_GET (arena_max, size_t, TUNABLE_CALLBACK (set_arena_max));
    TUNABLE_GET (arena_test, size_t, TUNABLE_CALLBACK (set_arena_test));
+#if USE_TCACHE
+  TUNABLE_GET (tcache_max, size_t, TUNABLE_CALLBACK (set_tcache_max));
+  TUNABLE_GET (tcache_count, size_t, TUNABLE_CALLBACK (set_tcache_count));
+  TUNABLE_GET (tcache_unsorted_limit, size_t,
+              TUNABLE_CALLBACK (set_tcache_unsorted_limit));
+#endif
    __libc_lock_unlock (main_arena.mutex);
  #else
    const char *s = NULL;
diff --git a/malloc/malloc.c b/malloc/malloc.c

index aa45626093b806fcb27be5698a986c8feeb7f2bb..2527e2504761744df2bdb1abdc02d936ff907ad2 100644 (file)
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -238,6 +238,9 @@
  /* For ALIGN_UP et. al.  */
  #include <libc-pointer-arith.h>
  
+/* For DIAG_PUSH/POP_NEEDS_COMMENT et al.  */
+#include <libc-diag.h>
+
  #include <malloc/malloc-internal.h>
  
  /*
@@ -296,6 +299,30 @@ __malloc_assert (const char *assertion, const char *file, unsigned int line,
  }
  #endif
  
+#if USE_TCACHE
+/* We want 64 entries.  This is an arbitrary limit, which tunables can reduce.  */
+# define TCACHE_MAX_BINS               64
+# define MAX_TCACHE_SIZE       tidx2usize (TCACHE_MAX_BINS-1)
+
+/* Largest user size served by bin IDX.  Only used to pre-fill the tunables.  */
+# define tidx2usize(idx)       (((size_t) (idx)) * MALLOC_ALIGNMENT + MINSIZE - SIZE_SZ)
+
+/* When "x" is from chunksize().  */
+# define csize2tidx(x) (((x) - MINSIZE + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
+/* When "x" is a user-provided size.  */
+# define usize2tidx(x) csize2tidx (request2size (x))
+
+/* With rounding and alignment, the bins are...
+   idx 0   bytes 0..24 (64-bit) or 0..12 (32-bit)
+   idx 1   bytes 25..40 or 13..20
+   idx 2   bytes 41..56 or 21..28
+   etc.  */
+
+/* This is another arbitrary limit, which tunables can change.  Each
+   tcache bin will hold at most this number of chunks.  */
+# define TCACHE_FILL_COUNT 7
+#endif
+
  
  /*
    REALLOC_ZERO_BYTES_FREES should be set if a call to
@@ -1712,6 +1739,17 @@ struct malloc_par
  
    /* First address handed out by MORECORE/sbrk.  */
    char *sbrk_base;
+
+#if USE_TCACHE
+  /* Maximum number of buckets to use.  */
+  size_t tcache_bins;
+  size_t tcache_max_bytes;
+  /* Maximum number of chunks in each bucket.  */
+  size_t tcache_count;
+  /* Maximum number of chunks to remove from the unsorted list, which
+     aren't used to prefill the cache.  */
+  size_t tcache_unsorted_limit;
+#endif
  };
  
  /* There are several instances of this struct ("arenas") in this
@@ -1750,6 +1788,13 @@ static struct malloc_par mp_ =
    .trim_threshold = DEFAULT_TRIM_THRESHOLD,
  #define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 2 : 8))
    .arena_test = NARENAS_FROM_NCORES (1)
+#if USE_TCACHE
+  ,
+  .tcache_count = TCACHE_FILL_COUNT,
+  .tcache_bins = TCACHE_MAX_BINS,
+  .tcache_max_bytes = tidx2usize (TCACHE_MAX_BINS-1),
+  .tcache_unsorted_limit = 0 /* No limit.  */
+#endif
  };
  
  /* Maximum size of memory handled in fastbins.  */
@@ -2875,6 +2920,124 @@ mremap_chunk (mchunkptr p, size_t new_size)
  
  /*------------------------ Public wrappers. --------------------------------*/
  
+#if USE_TCACHE
+
+/* We overlay this structure on the user-data portion of a chunk when
+   the chunk is stored in the per-thread cache.  */
+typedef struct tcache_entry
+{
+  struct tcache_entry *next;  /* Next cached chunk in the same bin, or NULL.  */
+} tcache_entry;
+
+/* There is one of these for each thread, which contains the
+   per-thread cache (hence "tcache_perthread_struct").  Keeping
+   overall size low is mildly important.  Note that COUNTS and ENTRIES
+   are redundant (we could have just counted the linked list each
+   time), this is for performance reasons.  */
+typedef struct tcache_perthread_struct
+{
+  char counts[TCACHE_MAX_BINS];  /* Number of chunks currently held in each bin.  */
+  tcache_entry *entries[TCACHE_MAX_BINS];  /* Head of each bin's singly linked list.  */
+} tcache_perthread_struct;
+
+static __thread char tcache_shutting_down = 0;  /* Set by tcache_thread_freeres; tcache_init then returns early.  */
+static __thread tcache_perthread_struct *tcache = NULL;  /* This thread's cache, or NULL if it has none.  */
+
+/* Push CHUNK onto the front of tcache bin TC_IDX.  Caller must ensure
+   that tc_idx is valid and that the bin still has room.  */
+static void
+tcache_put (mchunkptr chunk, size_t tc_idx)
+{
+  assert (tc_idx < TCACHE_MAX_BINS);
+  tcache_entry *entry = (tcache_entry *) chunk2mem (chunk);
+  entry->next = tcache->entries[tc_idx];
+  tcache->entries[tc_idx] = entry;
+  tcache->counts[tc_idx]++;
+}
+
+/* Pop and return the first chunk cached in bin TC_IDX.  Caller must
+   ensure that tc_idx is valid and that the bin is not empty.  */
+static void *
+tcache_get (size_t tc_idx)
+{
+  assert (tc_idx < TCACHE_MAX_BINS);	/* Check before indexing.  */
+  tcache_entry *e = tcache->entries[tc_idx];
+  assert (e != NULL);			/* E is dereferenced just below.  */
+  tcache->entries[tc_idx] = e->next;
+  --(tcache->counts[tc_idx]);
+  return (void *) e;
+}
+
+/* Free every chunk in this thread's tcache, then the tcache itself.  */
+static void __attribute__ ((section ("__libc_thread_freeres_fn")))
+tcache_thread_freeres (void)
+{
+  int i;
+  tcache_perthread_struct *tcache_tmp = tcache;
+
+  if (!tcache)
+    return;
+
+  /* Set the flag first, or __libc_free re-creates a tcache that leaks.  */
+  tcache = NULL;
+  tcache_shutting_down = 1;
+
+  for (i = 0; i < TCACHE_MAX_BINS; ++i)
+    {
+      while (tcache_tmp->entries[i])
+       {
+         tcache_entry *e = tcache_tmp->entries[i];
+         tcache_tmp->entries[i] = e->next;
+         __libc_free (e);
+       }
+    }
+  __libc_free (tcache_tmp);
+}
+text_set_element (__libc_thread_subfreeres, tcache_thread_freeres);
+
+/* Allocate and zero the calling thread's tcache_perthread_struct.
+   Does nothing once tcache_shutting_down has been set.  */
+static void
+tcache_init (void)
+{
+  const size_t tcache_size = sizeof (tcache_perthread_struct);
+  mstate ar_ptr;
+  void *mem;
+
+  if (tcache_shutting_down)
+    return;
+
+  arena_get (ar_ptr, tcache_size);
+  mem = _int_malloc (ar_ptr, tcache_size);
+  if (mem == NULL && ar_ptr != NULL)
+    {
+      /* The first attempt failed; retry via arena_get_retry.  */
+      ar_ptr = arena_get_retry (ar_ptr, tcache_size);
+      mem = _int_malloc (ar_ptr, tcache_size);
+    }
+
+  if (ar_ptr != NULL)
+    __libc_lock_unlock (ar_ptr->mutex);
+
+  /* Under memory pressure the allocation can fail.  TCACHE then stays
+     NULL and the next MAYBE_INIT_TCACHE simply tries again.  This
+     normally runs very early, so either memory is plentiful or there
+     is too little for non-trivial allocations anyway.  */
+  if (mem == NULL)
+    return;
+
+  tcache = (tcache_perthread_struct *) mem;
+  memset (tcache, 0, tcache_size);
+}
+
+#define MAYBE_INIT_TCACHE()	/* Create this thread's tcache on first use.  */ \
+  do { if (__glibc_unlikely (tcache == NULL)) tcache_init (); } \
+  while (0)
+
+#else
+#define MAYBE_INIT_TCACHE()
+#endif
+
  void *
  __libc_malloc (size_t bytes)
  {
@@ -2885,6 +3048,23 @@ __libc_malloc (size_t bytes)
      = atomic_forced_read (__malloc_hook);
    if (__builtin_expect (hook != NULL, 0))
      return (*hook)(bytes, RETURN_ADDRESS (0));
+#if USE_TCACHE
+  /* int_free also calls request2size, be careful to not pad twice.  */
+  size_t tbytes = request2size (bytes);
+  size_t tc_idx = csize2tidx (tbytes);
+
+  MAYBE_INIT_TCACHE ();
+
+  DIAG_PUSH_NEEDS_COMMENT;
+  if (tc_idx < mp_.tcache_bins
+      /*&& tc_idx < TCACHE_MAX_BINS*/ /* to appease gcc */
+      && tcache
+      && tcache->entries[tc_idx] != NULL)
+    {
+      return tcache_get (tc_idx);
+    }
+  DIAG_POP_NEEDS_COMMENT;
+#endif
  
    arena_get (ar_ptr, bytes);
  
@@ -2944,6 +3124,8 @@ __libc_free (void *mem)
        return;
      }
  
+  MAYBE_INIT_TCACHE ();
+
    ar_ptr = arena_for_chunk (p);
    _int_free (ar_ptr, p, 0);
  }
@@ -2981,7 +3163,10 @@ __libc_realloc (void *oldmem, size_t bytes)
    if (chunk_is_mmapped (oldp))
      ar_ptr = NULL;
    else
-    ar_ptr = arena_for_chunk (oldp);
+    {
+      MAYBE_INIT_TCACHE ();
+      ar_ptr = arena_for_chunk (oldp);
+    }
  
    /* Little security check which won't hurt performance: the allocator
       never wrapps around at the end of the address space.  Therefore
@@ -3206,6 +3391,8 @@ __libc_calloc (size_t n, size_t elem_size)
  
    sz = bytes;
  
+  MAYBE_INIT_TCACHE ();
+
    arena_get (av, sz);
    if (av)
      {
@@ -3336,6 +3523,10 @@ _int_malloc (mstate av, size_t bytes)
    mchunkptr fwd;                    /* misc temp for linking */
    mchunkptr bck;                    /* misc temp for linking */
  
+#if USE_TCACHE
+  size_t tcache_unsorted_count;            /* count of unsorted chunks processed */
+#endif
+
    const char *errstr = NULL;
  
    /*
@@ -3365,19 +3556,22 @@ _int_malloc (mstate av, size_t bytes)
       can try it without checking, which saves some time on this fast path.
     */
  
+#define REMOVE_FB(fb, victim, pp)                      \
+  do                                                   \
+    {                                                  \
+      victim = pp;                                     \
+      if (victim == NULL)                              \
+       break;                                          \
+    }                                                  \
+  while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim)) \
+        != victim);                                    \
+
    if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
      {
        idx = fastbin_index (nb);
        mfastbinptr *fb = &fastbin (av, idx);
        mchunkptr pp = *fb;
-      do
-        {
-          victim = pp;
-          if (victim == NULL)
-            break;
-        }
-      while ((pp = catomic_compare_and_exchange_val_acq (fb, victim->fd, victim))
-             != victim);
+      REMOVE_FB (fb, victim, pp);
        if (victim != 0)
          {
            if (__builtin_expect (fastbin_index (chunksize (victim)) != idx, 0))
@@ -3388,6 +3582,26 @@ _int_malloc (mstate av, size_t bytes)
                return NULL;
              }
            check_remalloced_chunk (av, victim, nb);
+#if USE_TCACHE
+         /* While we're here, if we see other chunks of the same size,
+            stash them in the tcache.  */
+         size_t tc_idx = csize2tidx (nb);
+         if (tcache && tc_idx < mp_.tcache_bins)
+           {
+             mchunkptr tc_victim;
+
+             /* While bin not empty and tcache not full, copy chunks over.  */
+             while (tcache->counts[tc_idx] < mp_.tcache_count
+                    && (pp = *fb) != NULL)
+               {
+                 REMOVE_FB (fb, tc_victim, pp);
+                 if (tc_victim != 0)
+                   {
+                     tcache_put (tc_victim, tc_idx);
+                   }
+               }
+           }
+#endif
            void *p = chunk2mem (victim);
            alloc_perturb (p, bytes);
            return p;
@@ -3426,6 +3640,32 @@ _int_malloc (mstate av, size_t bytes)
                if (av != &main_arena)
                 set_non_main_arena (victim);
                check_malloced_chunk (av, victim, nb);
+#if USE_TCACHE
+         /* While we're here, if we see other chunks of the same size,
+            stash them in the tcache.  */
+         size_t tc_idx = csize2tidx (nb);
+         if (tcache && tc_idx < mp_.tcache_bins)
+           {
+             mchunkptr tc_victim;
+
+             /* While bin not empty and tcache not full, copy chunks over.  */
+             while (tcache->counts[tc_idx] < mp_.tcache_count
+                    && (tc_victim = last (bin)) != bin)
+               {
+                 if (tc_victim != 0)
+                   {
+                     bck = tc_victim->bk;
+                     set_inuse_bit_at_offset (tc_victim, nb);
+                     if (av != &main_arena)
+                       set_non_main_arena (tc_victim);
+                     bin->bk = bck;
+                     bck->fd = bin;
+
+                     tcache_put (tc_victim, tc_idx);
+                   }
+               }
+           }
+#endif
                void *p = chunk2mem (victim);
                alloc_perturb (p, bytes);
                return p;
@@ -3464,6 +3704,16 @@ _int_malloc (mstate av, size_t bytes)
       otherwise need to expand memory to service a "small" request.
     */
  
+#if USE_TCACHE
+  INTERNAL_SIZE_T tcache_nb = 0;
+  size_t tc_idx = csize2tidx (nb);
+  if (tcache && tc_idx < mp_.tcache_bins)
+    tcache_nb = nb;
+  int return_cached = 0;
+
+  tcache_unsorted_count = 0;
+#endif
+
    for (;; )
      {
        int iters = 0;
@@ -3524,10 +3774,26 @@ _int_malloc (mstate av, size_t bytes)
                set_inuse_bit_at_offset (victim, size);
                if (av != &main_arena)
                 set_non_main_arena (victim);
+#if USE_TCACHE
+             /* Fill cache first, return to user only if cache fills.
+                We may return one of these chunks later.  */
+             if (tcache_nb
+                 && tcache->counts[tc_idx] < mp_.tcache_count)
+               {
+                 tcache_put (victim, tc_idx);
+                 return_cached = 1;
+                 continue;
+               }
+             else
+               {
+#endif
                check_malloced_chunk (av, victim, nb);
                void *p = chunk2mem (victim);
                alloc_perturb (p, bytes);
                return p;
+#if USE_TCACHE
+               }
+#endif
              }
  
            /* place chunk in bin */
@@ -3594,11 +3860,31 @@ _int_malloc (mstate av, size_t bytes)
            fwd->bk = victim;
            bck->fd = victim;
  
+#if USE_TCACHE
+      /* If we've processed as many chunks as we're allowed while
+        filling the cache, return one of the cached ones.  */
+      ++tcache_unsorted_count;
+      if (return_cached
+         && mp_.tcache_unsorted_limit > 0
+         && tcache_unsorted_count > mp_.tcache_unsorted_limit)
+       {
+         return tcache_get (tc_idx);
+       }
+#endif
+
  #define MAX_ITERS       10000
            if (++iters >= MAX_ITERS)
              break;
          }
  
+#if USE_TCACHE
+      /* If all the small chunks we found ended up cached, return one now.  */
+      if (return_cached)
+       {
+         return tcache_get (tc_idx);
+       }
+#endif
+
        /*
           If a large request, scan through the chunks of current bin in
           sorted order to find smallest that fits.  Use the skip list for this.
@@ -3884,6 +4170,20 @@ _int_free (mstate av, mchunkptr p, int have_lock)
  
    check_inuse_chunk(av, p);
  
+#if USE_TCACHE
+  {
+    size_t tc_idx = csize2tidx (size);
+
+    if (tcache
+       && tc_idx < mp_.tcache_bins
+       && tcache->counts[tc_idx] < mp_.tcache_count)
+      {
+       tcache_put (p, tc_idx);
+       return;
+      }
+  }
+#endif
+
    /*
      If eligible, place chunk on a fastbin so it can be found
      and used quickly in malloc.
@@ -4845,6 +5145,38 @@ do_set_arena_max (size_t value)
    return 1;
  }
  
+#if USE_TCACHE
+/* Set the tcache's maximum request size; an oversized VALUE is ignored.  */
+static inline int __always_inline
+do_set_tcache_max (size_t value)
+{
+  if (value <= MAX_TCACHE_SIZE)	/* VALUE is unsigned: no lower bound to test.  */
+    {
+      LIBC_PROBE (memory_tunable_tcache_max_bytes, 2, value, mp_.tcache_max_bytes);
+      mp_.tcache_max_bytes = value;
+      mp_.tcache_bins = csize2tidx (request2size(value)) + 1;
+    }
+  return 1;
+}
+
+static inline int __always_inline
+do_set_tcache_count (size_t value)  /* VALUE: max chunks kept per tcache bin.  */
+{
+  LIBC_PROBE (memory_tunable_tcache_count, 2, value, mp_.tcache_count);
+  if (value <= 127)  /* Bin counters are char; only 0..127 fits portably.  */
+    mp_.tcache_count = value;
+  return 1;  /* An out-of-range VALUE is ignored, not reported.  */
+}
+
+static inline int
+__always_inline
+do_set_tcache_unsorted_limit (size_t value)  /* VALUE: new limit; 0 means no limit.  */
+{
+  LIBC_PROBE (memory_tunable_tcache_unsorted_limit, 2, value, mp_.tcache_unsorted_limit);
+  mp_.tcache_unsorted_limit = value;  /* Caps unsorted chunks processed while pre-filling.  */
+  return 1;  /* Always reports 1.  */
+}
+#endif
  
  int
  __libc_mallopt (int param_number, int value)
diff --git a/manual/install.texi b/manual/install.texi

index 03eb2dd93b7f2fdcc1bda9c254bcd5851592d08f..b8deb9ceba2eef3f7d90d5be81db03de39fe6b95 100644 (file)
--- a/manual/install.texi
+++ b/manual/install.texi
@@ -232,6 +232,12 @@ libnss_nisplus are not built at all.
  Use this option to enable libnsl with all depending NSS modules and
  header files.
  
+@item --disable-experimental-malloc
+By default, a per-thread cache is enabled in @code{malloc}.  While
+this cache can be disabled on a per-application basis using tunables
+(set glibc.malloc.tcache_count to zero), this option can be used to
+remove it from the build completely.
+
  @item --build=@var{build-system}
  @itemx --host=@var{host-system}
  These options are for cross-compiling.  If you specify both options and
diff --git a/manual/probes.texi b/manual/probes.texi

index eb91c62703f5cf2838a2f4dc77ba5459d85c9df4..96acaed20645b5ef209d296066d7a77d61d359df 100644 (file)
--- a/manual/probes.texi
+++ b/manual/probes.texi
@@ -231,6 +231,25 @@ dynamic brk/mmap thresholds.  Argument @var{$arg1} and @var{$arg2} are
  the adjusted mmap and trim thresholds, respectively.
  @end deftp
  
+@deftp Probe memory_tunable_tcache_max_bytes (int @var{$arg1}, int @var{$arg2})
+This probe is triggered when the @code{glibc.malloc.tcache_max}
+tunable is set.  Argument @var{$arg1} is the requested value, and
+@var{$arg2} is the previous value of this tunable.
+@end deftp
+
+@deftp Probe memory_tunable_tcache_count (int @var{$arg1}, int @var{$arg2})
+This probe is triggered when the @code{glibc.malloc.tcache_count}
+tunable is set.  Argument @var{$arg1} is the requested value, and
+@var{$arg2} is the previous value of this tunable.
+@end deftp
+
+@deftp Probe memory_tunable_tcache_unsorted_limit (int @var{$arg1}, int @var{$arg2})
+This probe is triggered when the
+@code{glibc.malloc.tcache_unsorted_limit} tunable is set.  Argument
+@var{$arg1} is the requested value, and @var{$arg2} is the previous
+value of this tunable.
+@end deftp
+
  @node Mathematical Function Probes
  @section Mathematical Function Probes
  
diff --git a/manual/tunables.texi b/manual/tunables.texi

index 9331b03702f8382c6ff5ab86b538875eb80b76fd..b16d591b907cebb54bbdadb26563c7fcf952945d 100644 (file)
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -193,6 +193,38 @@ systems the limit is twice the number of cores online and on 64-bit systems, it
  is 8 times the number of cores online.
  @end deftp
  
+@deftp Tunable glibc.malloc.tcache_max
+The maximum size of a request (in bytes) which may be met via the
+per-thread cache.  The default (and maximum) value is 1032 bytes on
+64-bit systems and 516 bytes on 32-bit systems.
+@end deftp
+
+@deftp Tunable glibc.malloc.tcache_count
+The maximum number of chunks of each size to cache.  The default is 7.
+Values above 127 are unsupported (bin counters are @code{char}).  If
+set to zero, the per-thread cache is effectively disabled.
+
+The approximate maximum overhead of the per-thread cache is thus equal
+to the number of bins times the chunk count in each bin times the size
+of each chunk.  With defaults, the maximum overhead of the per-thread
+cache is approximately 236 KB on 64-bit systems and 118 KB on 32-bit
+systems.
+@end deftp
+
+@deftp Tunable glibc.malloc.tcache_unsorted_limit
+When the user requests memory and the request cannot be met via the
+per-thread cache, the arenas are used to meet the request.  At this
+time, additional chunks will be moved from existing arena lists to
+pre-fill the corresponding cache.  While copies from the fastbins,
+smallbins, and regular bins are bounded and predictable due to the bin
+sizes, copies from the unsorted bin are not bounded, and incur
+additional time penalties as they need to be sorted as they're
+scanned.  To make scanning the unsorted list more predictable and
+bounded, the user may set this tunable to limit the number of chunks
+that are scanned from the unsorted list while searching for chunks to
+pre-fill the per-thread cache.  The default (or zero) means no limit.
+@end deftp
+
  @node Hardware Capability Tunables
  @section Hardware Capability Tunables
  @cindex hardware capability tunables
author	DJ Delorie <dj@delorie.com>
	Thu, 6 Jul 2017 17:37:30 +0000 (13:37 -0400)
committer	DJ Delorie <dj@delorie.com>
	Thu, 6 Jul 2017 17:37:30 +0000 (13:37 -0400)
ChangeLog		patch \| blob \| blame \| history
INSTALL		patch \| blob \| blame \| history
NEWS		patch \| blob \| blame \| history
config.make.in		patch \| blob \| blame \| history
configure		patch \| blob \| blame \| history
configure.ac		patch \| blob \| blame \| history
elf/dl-tunables.list		patch \| blob \| blame \| history
malloc/Makefile		patch \| blob \| blame \| history
malloc/arena.c		patch \| blob \| blame \| history
malloc/malloc.c		patch \| blob \| blame \| history
manual/install.texi		patch \| blob \| blame \| history
manual/probes.texi		patch \| blob \| blame \| history
manual/tunables.texi		patch \| blob \| blame \| history