Update copyright dates with scripts/update-copyrights.

[thirdparty/glibc.git] / nscd / connections.c
diff --git a/nscd/connections.c b/nscd/connections.c

index 63a01e3bb6bffb7249cab3e88db5d9468abd791e..f3b16f7246eb864a49a43c90ce646f7724d9255a 100644 (file)
--- a/nscd/connections.c
+++ b/nscd/connections.c
@@ -1,22 +1,20 @@
  /* Inner loops of cache daemon.
-   Copyright (C) 1998-2003, 2004, 2005 Free Software Foundation, Inc.
+   Copyright (C) 1998-2016 Free Software Foundation, Inc.
     This file is part of the GNU C Library.
     Contributed by Ulrich Drepper <drepper@cygnus.com>, 1998.
  
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published
+   by the Free Software Foundation; version 2 of the License, or
+   (at your option) any later version.
  
-   The GNU C Library is distributed in the hope that it will be useful,
+   This program is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
  
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, see <http://www.gnu.org/licenses/>.  */
  
  #include <alloca.h>
  #include <assert.h>
@@ -25,6 +23,7 @@
  #include <errno.h>
  #include <fcntl.h>
  #include <grp.h>
+#include <ifaddrs.h>
  #include <libintl.h>
  #include <pthread.h>
  #include <pwd.h>
@@ -32,13 +31,24 @@
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>
+#include <stdint.h>
  #include <arpa/inet.h>
+#ifdef HAVE_NETLINK
+# include <linux/netlink.h>
+# include <linux/rtnetlink.h>
+#endif
  #ifdef HAVE_EPOLL
  # include <sys/epoll.h>
  #endif
+#ifdef HAVE_INOTIFY
+# include <sys/inotify.h>
+#endif
  #include <sys/mman.h>
  #include <sys/param.h>
  #include <sys/poll.h>
+#ifdef HAVE_SENDFILE
+# include <sys/sendfile.h>
+#endif
  #include <sys/socket.h>
  #include <sys/stat.h>
  #include <sys/un.h>
@@ -46,17 +56,12 @@
  #include "nscd.h"
  #include "dbg_log.h"
  #include "selinux.h"
+#include <resolv/resolv.h>
  
-
-/* Number of bytes of data we initially reserve for each hash table bucket.  */
-#define DEFAULT_DATASIZE_PER_BUCKET 1024
+#include <kernel-features.h>
+#include <libc-internal.h>
  
  
-/* Wrapper functions with error checking for standard functions.  */
-extern void *xmalloc (size_t n);
-extern void *xcalloc (size_t n, size_t s);
-extern void *xrealloc (void *o, size_t n);
-
  /* Support to run nscd as an unprivileged user */
  const char *server_user;
  static uid_t server_uid;
@@ -75,7 +80,7 @@ static void begin_drop_privileges (void);
  static void finish_drop_privileges (void);
  
  /* Map request type to a string.  */
-const char *serv2str[LASTREQ] =
+const char *const serv2str[LASTREQ] =
  {
    [GETPWBYNAME] = "GETPWBYNAME",
    [GETPWBYUID] = "GETPWBYUID",
@@ -92,7 +97,13 @@ const char *serv2str[LASTREQ] =
    [GETFDGR] = "GETFDGR",
    [GETFDHST] = "GETFDHST",
    [GETAI] = "GETAI",
-  [INITGROUPS] = "INITGROUPS"
+  [INITGROUPS] = "INITGROUPS",
+  [GETSERVBYNAME] = "GETSERVBYNAME",
+  [GETSERVBYPORT] = "GETSERVBYPORT",
+  [GETFDSERV] = "GETFDSERV",
+  [GETNETGRENT] = "GETNETGRENT",
+  [INNETGR] = "INNETGR",
+  [GETFDNETGR] = "GETFDNETGR"
  };
  
  /* The control data structures for the services.  */
@@ -100,11 +111,15 @@ struct database_dyn dbs[lastdb] =
  {
    [pwddb] = {
      .lock = PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP,
+    .prune_lock = PTHREAD_MUTEX_INITIALIZER,
+    .prune_run_lock = PTHREAD_MUTEX_INITIALIZER,
      .enabled = 0,
      .check_file = 1,
      .persistent = 0,
+    .propagate = 1,
      .shared = 0,
-    .filename = "/etc/passwd",
+    .max_db_size = DEFAULT_MAX_DB_SIZE,
+    .suggested_module = DEFAULT_SUGGESTED_MODULE,
      .db_filename = _PATH_NSCD_PASSWD_DB,
      .disabled_iov = &pwd_iov_disabled,
      .postimeout = 3600,
@@ -115,11 +130,15 @@ struct database_dyn dbs[lastdb] =
    },
    [grpdb] = {
      .lock = PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP,
+    .prune_lock = PTHREAD_MUTEX_INITIALIZER,
+    .prune_run_lock = PTHREAD_MUTEX_INITIALIZER,
      .enabled = 0,
      .check_file = 1,
      .persistent = 0,
+    .propagate = 1,
      .shared = 0,
-    .filename = "/etc/group",
+    .max_db_size = DEFAULT_MAX_DB_SIZE,
+    .suggested_module = DEFAULT_SUGGESTED_MODULE,
      .db_filename = _PATH_NSCD_GROUP_DB,
      .disabled_iov = &grp_iov_disabled,
      .postimeout = 3600,
@@ -130,11 +149,15 @@ struct database_dyn dbs[lastdb] =
    },
    [hstdb] = {
      .lock = PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP,
+    .prune_lock = PTHREAD_MUTEX_INITIALIZER,
+    .prune_run_lock = PTHREAD_MUTEX_INITIALIZER,
      .enabled = 0,
      .check_file = 1,
      .persistent = 0,
+    .propagate = 0,            /* Not used.  */
      .shared = 0,
-    .filename = "/etc/hosts",
+    .max_db_size = DEFAULT_MAX_DB_SIZE,
+    .suggested_module = DEFAULT_SUGGESTED_MODULE,
      .db_filename = _PATH_NSCD_HOSTS_DB,
      .disabled_iov = &hst_iov_disabled,
      .postimeout = 3600,
@@ -142,33 +165,80 @@ struct database_dyn dbs[lastdb] =
      .wr_fd = -1,
      .ro_fd = -1,
      .mmap_used = false
+  },
+  [servdb] = {
+    .lock = PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP,
+    .prune_lock = PTHREAD_MUTEX_INITIALIZER,
+    .prune_run_lock = PTHREAD_MUTEX_INITIALIZER,
+    .enabled = 0,
+    .check_file = 1,
+    .persistent = 0,
+    .propagate = 0,            /* Not used.  */
+    .shared = 0,
+    .max_db_size = DEFAULT_MAX_DB_SIZE,
+    .suggested_module = DEFAULT_SUGGESTED_MODULE,
+    .db_filename = _PATH_NSCD_SERVICES_DB,
+    .disabled_iov = &serv_iov_disabled,
+    .postimeout = 28800,
+    .negtimeout = 20,
+    .wr_fd = -1,
+    .ro_fd = -1,
+    .mmap_used = false
+  },
+  [netgrdb] = {
+    .lock = PTHREAD_RWLOCK_WRITER_NONRECURSIVE_INITIALIZER_NP,
+    .prune_lock = PTHREAD_MUTEX_INITIALIZER,
+    .prune_run_lock = PTHREAD_MUTEX_INITIALIZER,
+    .enabled = 0,
+    .check_file = 1,
+    .persistent = 0,
+    .propagate = 0,            /* Not used.  */
+    .shared = 0,
+    .max_db_size = DEFAULT_MAX_DB_SIZE,
+    .suggested_module = DEFAULT_SUGGESTED_MODULE,
+    .db_filename = _PATH_NSCD_NETGROUP_DB,
+    .disabled_iov = &netgroup_iov_disabled,
+    .postimeout = 28800,
+    .negtimeout = 20,
+    .wr_fd = -1,
+    .ro_fd = -1,
+    .mmap_used = false
    }
  };
  
  
  /* Mapping of request type to database.  */
+/* Each entry pairs a request type with the database it refers to (NULL
+   for requests that are not tied to a database) and a flag that is set
+   for the lookup requests and clear for the GETFD*, SHUTDOWN and
+   GETSTAT entries.
+   NOTE(review): [SHUTDOWN] is initialized twice below.  Both entries are
+   identical, so the duplicate is harmless, but the second one was
+   presumably meant for a different request type -- confirm against the
+   request_type enumeration.  */
-static struct database_dyn *const serv2db[LASTREQ] =
+static struct
+{
+  bool data_request;
+  struct database_dyn *db;
+} const reqinfo[LASTREQ] =
  {
-  [GETPWBYNAME] = &dbs[pwddb],
-  [GETPWBYUID] = &dbs[pwddb],
-  [GETGRBYNAME] = &dbs[grpdb],
-  [GETGRBYGID] = &dbs[grpdb],
-  [GETHOSTBYNAME] = &dbs[hstdb],
-  [GETHOSTBYNAMEv6] = &dbs[hstdb],
-  [GETHOSTBYADDR] = &dbs[hstdb],
-  [GETHOSTBYADDRv6] = &dbs[hstdb],
-  [GETFDPW] = &dbs[pwddb],
-  [GETFDGR] = &dbs[grpdb],
-  [GETFDHST] = &dbs[hstdb],
-  [GETAI] = &dbs[hstdb],
-  [INITGROUPS] = &dbs[grpdb]
+  [GETPWBYNAME] = { true, &dbs[pwddb] },
+  [GETPWBYUID] = { true, &dbs[pwddb] },
+  [GETGRBYNAME] = { true, &dbs[grpdb] },
+  [GETGRBYGID] = { true, &dbs[grpdb] },
+  [GETHOSTBYNAME] = { true, &dbs[hstdb] },
+  [GETHOSTBYNAMEv6] = { true, &dbs[hstdb] },
+  [GETHOSTBYADDR] = { true, &dbs[hstdb] },
+  [GETHOSTBYADDRv6] = { true, &dbs[hstdb] },
+  [SHUTDOWN] = { false, NULL },
+  [GETSTAT] = { false, NULL },
+  [SHUTDOWN] = { false, NULL },
+  [GETFDPW] = { false, &dbs[pwddb] },
+  [GETFDGR] = { false, &dbs[grpdb] },
+  [GETFDHST] = { false, &dbs[hstdb] },
+  [GETAI] = { true, &dbs[hstdb] },
+  [INITGROUPS] = { true, &dbs[grpdb] },
+  [GETSERVBYNAME] = { true, &dbs[servdb] },
+  [GETSERVBYPORT] = { true, &dbs[servdb] },
+  [GETFDSERV] = { false, &dbs[servdb] },
+  [GETNETGRENT] = { true, &dbs[netgrdb] },
+  [INNETGR] = { true, &dbs[netgrdb] },
+  [GETFDNETGR] = { false, &dbs[netgrdb] }
  };
  
  
-/* Number of seconds between two cache pruning runs.  */
-#define CACHE_PRUNE_INTERVAL   15
-
-
  /* Initial number of threads to use.  */
  int nthreads = -1;
  /* Maximum number of threads to use.  */
@@ -177,24 +247,269 @@ int max_nthreads = 32;
  /* Socket for incoming connections.  */
  static int sock;
  
+#ifdef HAVE_INOTIFY
+/* Inotify descriptor.  */
+int inotify_fd = -1;
+#endif
+
+#ifdef HAVE_NETLINK
+/* Descriptor for netlink status updates.  */
+static int nl_status_fd = -1;
+#endif
+
+#ifndef __ASSUME_ACCEPT4
+static int have_accept4;
+#endif
+
  /* Number of times clients had to wait.  */
  unsigned long int client_queued;
  
  
-/* Initialize database information structures.  */
-void
-nscd_init (void)
+/* Send LEN bytes starting at BUF on descriptor FD, calling send (with
+   MSG_NOSIGNAL) repeatedly until everything has been sent or a call
+   returns zero or a negative value.  Returns the negative result of
+   the failing send call on error, otherwise the number of bytes that
+   were actually sent, which may be less than LEN.  */
+ssize_t
+writeall (int fd, const void *buf, size_t len)
  {
-  struct sockaddr_un sock_addr;
-  size_t cnt;
+  size_t n = len;
+  ssize_t ret;
+  do
+    {
+      ret = TEMP_FAILURE_RETRY (send (fd, buf, n, MSG_NOSIGNAL));
+      if (ret <= 0)
+       break;
+      /* Skip the part that was sent and continue with the rest.  */
+      buf = (const char *) buf + ret;
+      n -= ret;
+    }
+  while (n > 0);
+  return ret < 0 ? ret : len - n;
+}
+
  
-  /* Secure mode and unprivileged mode are incompatible */
-  if (server_user != NULL && secure_in_use)
+#ifdef HAVE_SENDFILE
+/* Copy LEN bytes from descriptor FROMFD, starting at offset OFF, to
+   descriptor TOFD by calling sendfile repeatedly until everything has
+   been transferred or a call returns zero or a negative value.  OFF is
+   passed by address, so sendfile advances the local copy between
+   calls.  Returns the negative result of the failing call on error,
+   otherwise the number of bytes transferred, which may be less than
+   LEN.  */
+ssize_t
+sendfileall (int tofd, int fromfd, off_t off, size_t len)
+{
+  ssize_t n = len;
+  ssize_t ret;
+
+  do
      {
-      dbg_log (_("Cannot run nscd in secure mode as unprivileged user"));
-      exit (1);
+      ret = TEMP_FAILURE_RETRY (sendfile (tofd, fromfd, &off, n));
+      if (ret <= 0)
+       break;
+      n -= ret;
      }
+  while (n > 0);
+  return ret < 0 ? ret : len - n;
+}
+#endif
+
+
+/* Marker values stored per byte in the usage map that check_use
+   maintains while a persistent database is verified.  */
+enum usekey
+  {
+    /* The byte has not been claimed by any object.  */
+    use_not = 0x00,
+
+    /* Kind of object occupying the byte: use_he is passed for hash
+       entries, use_data for data records.  */
+    use_he = 0x01,
+    use_data = 0x03,
+
+    /* Flag bits that are OR-ed into a kind.  check_use puts use_begin
+       on the first byte and use_end on the last byte of an object;
+       use_first is kept only on the first byte.  */
+    use_first = 0x10,
+    use_begin = 0x20,
+    use_end = 0x40,
+
+    /* Ready-made combinations of a kind and its flags.  */
+    use_he_begin = use_he | use_begin,
+    use_he_end = use_he | use_end,
+    use_data_begin = use_data | use_begin,
+    use_data_end = use_data | use_end,
+    use_data_first = use_data_begin | use_first
+  };
+
  
+/* Record in USEMAP that the LEN bytes starting at offset START are
+   occupied by an object of kind USE, or, if that range has been
+   recorded before, check that the earlier record describes exactly the
+   same object.  FIRST_FREE is the end of the used area.  Returns 1 on
+   success and 0 if the range is out of bounds, misaligned, overlaps a
+   different object, or is a hash entry that was already recorded.
+   DATA is not referenced in the body.  LEN must be at least 2.  */
+static int
+check_use (const char *data, nscd_ssize_t first_free, uint8_t *usemap,
+          enum usekey use, ref_t start, size_t len)
+{
+  assert (len >= 2);
+
+  /* The object must lie inside the used area and start on a block
+     boundary.  */
+  if (start > first_free || start + len > first_free
+      || (start & BLOCK_ALIGN_M1))
+    return 0;
+
+  if (usemap[start] == use_not)
+    {
+      /* Add the start marker.  */
+      usemap[start] = use | use_begin;
+      /* Only the first byte keeps the use_first flag.  */
+      use &= ~use_first;
+
+      /* Claim the remaining bytes; any byte that is already taken means
+         the object overlaps another one.  */
+      while (--len > 0)
+       if (usemap[++start] != use_not)
+         return 0;
+       else
+         usemap[start] = use;
+
+      /* Add the end marker.  */
+      usemap[start] = use | use_end;
+    }
+  else if ((usemap[start] & ~use_first) == ((use | use_begin) & ~use_first))
+    {
+      /* Hash entries can't be shared.  */
+      if (use == use_he)
+       return 0;
+
+      /* Remember if this reference carries the use_first flag.  */
+      usemap[start] |= (use & use_first);
+      use &= ~use_first;
+
+      /* The interior bytes and the end marker must match the earlier
+         record, otherwise the two references disagree about the size.  */
+      while (--len > 1)
+       if (usemap[++start] != use)
+         return 0;
+
+      if (usemap[++start] != (use | use_end))
+       return 0;
+    }
+  else
+    /* Points to a wrong object or somewhere in the middle.  */
+    return 0;
+
+  return 1;
+}
+
+
+/* Verify data in persistent database.
+
+   MEM is the mapped database, READHEAD the header that was read from
+   the file beforehand and DBNR the index of the database in DBS.
+   Returns 1 if the header is sane and every hash entry and data record
+   reachable from the hash table lies inside the used part of the data
+   area without overlapping a different object.  Returns 0 otherwise,
+   including when the usage map cannot be allocated.  */
+static int
+verify_persistent_db (void *mem, struct database_pers_head *readhead, int dbnr)
+{
+  assert (dbnr == pwddb || dbnr == grpdb || dbnr == hstdb || dbnr == servdb
+         || dbnr == netgrdb);
+
+  time_t now = time (NULL);
+
+  struct database_pers_head *head = mem;
+  struct database_pers_head head_copy = *head;
+
+  /* Check that the header that was read matches the head in the database.  */
+  if (memcmp (head, readhead, sizeof (*head)) != 0)
+    return 0;
+
+  /* First some easy tests: make sure the database header is sane.  */
+  if (head->version != DB_VERSION
+      || head->header_size != sizeof (*head)
+      /* We allow a timestamp to be one hour ahead of the current time.
+        This should cover daylight saving time changes.  */
+      || head->timestamp > now + 60 * 60 + 60
+      || (head->gc_cycle & 1)
+      || head->module == 0
+      || (size_t) head->module > INT32_MAX / sizeof (ref_t)
+      || (size_t) head->data_size > INT32_MAX - head->module * sizeof (ref_t)
+      || head->first_free < 0
+      || head->first_free > head->data_size
+      || (head->first_free & BLOCK_ALIGN_M1) != 0
+      || head->maxnentries < 0
+      || head->maxnsearched < 0)
+    return 0;
+
+  /* One marker byte per byte of the used data area; see check_use.  */
+  uint8_t *usemap = calloc (head->first_free, 1);
+  if (usemap == NULL)
+    return 0;
+
+  /* The data area follows the hash table.  */
+  const char *data = (char *) &head->array[roundup (head->module,
+                                                   ALIGN / sizeof (ref_t))];
+
+  nscd_ssize_t he_cnt = 0;
+  for (nscd_ssize_t cnt = 0; cnt < head->module; ++cnt)
+    {
+      /* WORK walks the chain of this bucket.  TRAIL follows at half the
+        speed so that a chain which loops back is detected.  */
+      ref_t trail = head->array[cnt];
+      ref_t work = trail;
+      int tick = 0;
+
+      while (work != ENDREF)
+       {
+         if (! check_use (data, head->first_free, usemap, use_he, work,
+                          sizeof (struct hashentry)))
+           goto fail;
+
+         /* Now we know we can dereference the record.  */
+         struct hashentry *here = (struct hashentry *) (data + work);
+
+         ++he_cnt;
+
+         /* Make sure the record is for this type of service.  */
+         if (here->type >= LASTREQ
+             || reqinfo[here->type].db != &dbs[dbnr])
+           goto fail;
+
+         /* Validate boolean field value.  */
+         if (here->first != false && here->first != true)
+           goto fail;
+
+         if (here->len < 0)
+           goto fail;
+
+         /* Now the data.  */
+         if (here->packet < 0
+             || here->packet > head->first_free
+             || here->packet + sizeof (struct datahead) > head->first_free)
+           goto fail;
+
+         struct datahead *dh = (struct datahead *) (data + here->packet);
+
+         /* Sanity-check the record header before its size is handed to
+            check_use.  check_use asserts LEN >= 2 and walks LEN bytes
+            of the usage map, so a corrupted allocsize has to be
+            rejected here; otherwise it would abort the daemon or make
+            check_use run past the end of USEMAP.  The explicit test
+            against zero is needed because the comparison with sizeof
+            is carried out in an unsigned type.
+            NOTE(review): assumes allocsize is a signed field --
+            confirm against the definition of struct datahead.  */
+         if (dh->allocsize < 0
+             || dh->allocsize < sizeof (struct datahead)
+             || dh->recsize > dh->allocsize
+             || (dh->notfound != false && dh->notfound != true)
+             || (dh->usable != false && dh->usable != true))
+           goto fail;
+
+         if (! check_use (data, head->first_free, usemap,
+                          use_data | (here->first ? use_first : 0),
+                          here->packet, dh->allocsize))
+           goto fail;
+
+         /* The key must be stored inside the record, after its header.  */
+         if (here->key < here->packet + sizeof (struct datahead)
+             || here->key > here->packet + dh->allocsize
+             || here->key + here->len > here->packet + dh->allocsize)
+           goto fail;
+
+         work = here->next;
+
+         if (work == trail)
+           /* A circular list, this must not happen.  */
+           goto fail;
+         if (tick)
+           trail = ((struct hashentry *) (data + trail))->next;
+         tick = 1 - tick;
+       }
+    }
+
+  if (he_cnt != head->nentries)
+    goto fail;
+
+  /* See if all data and keys had at least one reference from
+     he->first == true hashentry.  */
+  for (ref_t idx = 0; idx < head->first_free; ++idx)
+    {
+      if (usemap[idx] == use_data_begin)
+       goto fail;
+    }
+
+  /* Finally, make sure the database hasn't changed since the first test.  */
+  if (memcmp (mem, &head_copy, sizeof (*head)) != 0)
+    goto fail;
+
+  free (usemap);
+  return 1;
+
+fail:
+  free (usemap);
+  return 0;
+}
+
+
+/* Flags added to every open call in this file: O_CLOEXEC where the
+   platform defines it, nothing otherwise.  */
+#ifndef O_CLOEXEC
+# define EXTRA_O_FLAGS 0
+#else
+# define EXTRA_O_FLAGS O_CLOEXEC
+#endif
+
+
+/* Initialize database information structures.  */
+void
+nscd_init (void)
+{
    /* Look up unprivileged uid/gid/groups before we start listening on the
       socket  */
    if (server_user != NULL)
@@ -202,9 +517,9 @@ nscd_init (void)
  
    if (nthreads == -1)
      /* No configuration for this value, assume a default.  */
-    nthreads = 2 * lastdb;
+    nthreads = 4;
  
-  for (cnt = 0; cnt < lastdb; ++cnt)
+  for (size_t cnt = 0; cnt < lastdb; ++cnt)
      if (dbs[cnt].enabled)
        {
         pthread_rwlock_init (&dbs[cnt].lock, NULL);
@@ -213,9 +528,10 @@ nscd_init (void)
         if (dbs[cnt].persistent)
           {
             /* Try to open the appropriate file on disk.  */
-           int fd = open (dbs[cnt].db_filename, O_RDWR);
+           int fd = open (dbs[cnt].db_filename, O_RDWR | EXTRA_O_FLAGS);
             if (fd != -1)
               {
+               char *msg = NULL;
                 struct stat64 st;
                 void *mem;
                 size_t total;
@@ -224,38 +540,54 @@ nscd_init (void)
                                                       sizeof (head)));
                 if (n != sizeof (head) || fstat64 (fd, &st) != 0)
                   {
+                 fail_db_errno:
+                   /* The code is single-threaded at this point so
+                      using strerror is just fine.  */
+                   msg = strerror (errno);
                   fail_db:
                     dbg_log (_("invalid persistent database file \"%s\": %s"),
-                            dbs[cnt].db_filename, strerror (errno));
-                   dbs[cnt].persistent = 0;
+                            dbs[cnt].db_filename, msg);
+                   unlink (dbs[cnt].db_filename);
                   }
                 else if (head.module == 0 && head.data_size == 0)
                   {
-                   /* The file has been created, but the head has not been
-                      initialized yet.  Remove the old file.  */
-                   unlink (dbs[cnt].db_filename);
+                   /* The file has been created, but the head has not
+                      been initialized yet.  */
+                   msg = _("uninitialized header");
+                   goto fail_db;
                   }
                 else if (head.header_size != (int) sizeof (head))
                   {
-                   dbg_log (_("invalid persistent database file \"%s\": %s"),
-                            dbs[cnt].db_filename,
-                            _("header size does not match"));
-                   dbs[cnt].persistent = 0;
+                   msg = _("header size does not match");
+                   goto fail_db;
                   }
                 else if ((total = (sizeof (head)
                                    + roundup (head.module * sizeof (ref_t),
                                               ALIGN)
                                    + head.data_size))
-                        > st.st_size)
+                        > st.st_size
+                        || total < sizeof (head))
                   {
-                   dbg_log (_("invalid persistent database file \"%s\": %s"),
-                            dbs[cnt].db_filename,
-                            _("file size does not match"));
-                   dbs[cnt].persistent = 0;
+                   msg = _("file size does not match");
+                   goto fail_db;
+                 }
+               /* Note we map with the maximum size allowed for the
+                  database.  This is likely much larger than the
+                  actual file size.  This is OK on most OSes since
+                  extensions of the underlying file will
+                  automatically translate more pages available for
+                  memory access.  */
+               else if ((mem = mmap (NULL, dbs[cnt].max_db_size,
+                                     PROT_READ | PROT_WRITE,
+                                     MAP_SHARED, fd, 0))
+                        == MAP_FAILED)
+                 goto fail_db_errno;
+               else if (!verify_persistent_db (mem, &head, cnt))
+                 {
+                   munmap (mem, total);
+                   msg = _("verification failed");
+                   goto fail_db;
                   }
-               else if ((mem = mmap (NULL, total, PROT_READ | PROT_WRITE,
-                                     MAP_SHARED, fd, 0)) == MAP_FAILED)
-                 goto fail_db;
                 else
                   {
                     /* Success.  We have the database.  */
@@ -275,7 +607,8 @@ nscd_init (void)
                     /* We also need a read-only descriptor.  */
                     if (dbs[cnt].shared)
                       {
-                       dbs[cnt].ro_fd = open (dbs[cnt].db_filename, O_RDONLY);
+                       dbs[cnt].ro_fd = open (dbs[cnt].db_filename,
+                                              O_RDONLY | EXTRA_O_FLAGS);
                         if (dbs[cnt].ro_fd == -1)
                           dbg_log (_("\
  cannot create read-only descriptor for \"%s\"; no mmap"),
@@ -292,6 +625,9 @@ cannot create read-only descriptor for \"%s\"; no mmap"),
                 if (fd != -1)
                   close (fd);
               }
+           else if (errno == EACCES)
+             do_exit (EXIT_FAILURE, 0, _("cannot access '%s'"),
+                      dbs[cnt].db_filename);
           }
  
         if (dbs[cnt].head == NULL)
@@ -312,22 +648,23 @@ cannot create read-only descriptor for \"%s\"; no mmap"),
             if (dbs[cnt].persistent)
               {
                 fd = open (dbs[cnt].db_filename,
-                          O_RDWR | O_CREAT | O_EXCL | O_TRUNC,
+                          O_RDWR | O_CREAT | O_EXCL | O_TRUNC | EXTRA_O_FLAGS,
                            S_IRUSR | S_IWUSR);
                 if (fd != -1 && dbs[cnt].shared)
-                 ro_fd = open (dbs[cnt].db_filename, O_RDONLY);
+                 ro_fd = open (dbs[cnt].db_filename,
+                               O_RDONLY | EXTRA_O_FLAGS);
               }
             else
               {
                 char fname[] = _PATH_NSCD_XYZ_DB_TMP;
-               fd = mkstemp (fname);
+               fd = mkostemp (fname, EXTRA_O_FLAGS);
  
                 /* We do not need the file name anymore after we
                    opened another file descriptor in read-only mode.  */
                 if (fd != -1)
                   {
                     if (dbs[cnt].shared)
-                     ro_fd = open (fname, O_RDONLY);
+                     ro_fd = open (fname, O_RDONLY | EXTRA_O_FLAGS);
  
                     unlink (fname);
                   }
@@ -339,8 +676,7 @@ cannot create read-only descriptor for \"%s\"; no mmap"),
                   {
                     dbg_log (_("database for %s corrupted or simultaneously used; remove %s manually if necessary and restart"),
                              dbnames[cnt], dbs[cnt].db_filename);
-                   // XXX Correct way to terminate?
-                   exit (1);
+                   do_exit (1, 0, NULL);
                   }
  
                 if  (dbs[cnt].persistent)
@@ -362,8 +698,8 @@ cannot create read-only descriptor for \"%s\"; no mmap"),
  cannot create read-only descriptor for \"%s\"; no mmap"),
                            dbs[cnt].db_filename);
  
-               /* Before we create the header, initialiye the hash
-                  table.  So that if we get interrupted if writing
+               /* Before we create the header, initialize the hash
+                  table.  That way if we get interrupted while writing
                    the header we can recognize a partially initialized
                    database.  */
                 size_t ps = sysconf (_SC_PAGESIZE);
@@ -378,20 +714,23 @@ cannot create read-only descriptor for \"%s\"; no mmap"),
                 if (offset % ps != 0)
                   {
                     towrite = MIN (remaining, ps - (offset % ps));
-                   pwrite (fd, tmpbuf, towrite, offset);
+                   if (pwrite (fd, tmpbuf, towrite, offset) != towrite)
+                     goto write_fail;
                     offset += towrite;
                     remaining -= towrite;
                   }
  
                 while (remaining > ps)
                   {
-                   pwrite (fd, tmpbuf, ps, offset);
+                   if (pwrite (fd, tmpbuf, ps, offset) == -1)
+                     goto write_fail;
                     offset += ps;
                     remaining -= ps;
                   }
  
-               if (remaining > 0)
-                 pwrite (fd, tmpbuf, remaining, offset);
+               if (remaining > 0
+                   && pwrite (fd, tmpbuf, remaining, offset) != remaining)
+                 goto write_fail;
  
                 /* Create the header of the file.  */
                 struct database_pers_head head =
@@ -407,10 +746,13 @@ cannot create read-only descriptor for \"%s\"; no mmap"),
  
                 if ((TEMP_FAILURE_RETRY (write (fd, &head, sizeof (head)))
                      != sizeof (head))
-                   || ftruncate (fd, total) != 0
-                   || (mem = mmap (NULL, total, PROT_READ | PROT_WRITE,
+                   || (TEMP_FAILURE_RETRY_VAL (posix_fallocate (fd, 0, total))
+                       != 0)
+                   || (mem = mmap (NULL, dbs[cnt].max_db_size,
+                                   PROT_READ | PROT_WRITE,
                                     MAP_SHARED, fd, 0)) == MAP_FAILED)
                   {
+                 write_fail:
                     unlink (dbs[cnt].db_filename);
                     dbg_log (_("cannot write to database file %s: %s"),
                              dbs[cnt].db_filename, strerror (errno));
@@ -440,6 +782,11 @@ cannot create read-only descriptor for \"%s\"; no mmap"),
               }
           }
  
+#if !defined O_CLOEXEC || !defined __ASSUME_O_CLOEXEC
+       /* We do not check here whether the O_CLOEXEC provided to the
+          open call was successful or not.  The two fcntl calls are
+          only performed once each per process start-up and therefore
+          is not noticeable at all.  */
         if (paranoia
             && ((dbs[cnt].wr_fd != -1
                  && fcntl (dbs[cnt].wr_fd, F_SETFD, FD_CLOEXEC) == -1)
@@ -451,6 +798,7 @@ cannot set socket to close on exec: %s; disabling paranoia mode"),
                      strerror (errno));
             paranoia = 0;
           }
+#endif
  
         if (dbs[cnt].head == NULL)
           {
@@ -461,7 +809,7 @@ cannot set socket to close on exec: %s; disabling paranoia mode"),
             dbs[cnt].head = xmalloc (sizeof (struct database_pers_head)
                                      + (dbs[cnt].suggested_module
                                         * sizeof (ref_t)));
-           memset (dbs[cnt].head, '\0', sizeof (dbs[cnt].head));
+           memset (dbs[cnt].head, '\0', sizeof (struct database_pers_head));
             assert (~ENDREF == 0);
             memset (dbs[cnt].head->array, '\xff',
                     dbs[cnt].suggested_module * sizeof (ref_t));
@@ -474,55 +822,23 @@ cannot set socket to close on exec: %s; disabling paranoia mode"),
             dbs[cnt].shared = 0;
             assert (dbs[cnt].ro_fd == -1);
           }
-
-       if (dbs[cnt].check_file)
-         {
-           /* We need the modification date of the file.  */
-           struct stat st;
-
-           if (stat (dbs[cnt].filename, &st) < 0)
-             {
-               /* We cannot stat() the file, disable file checking.  */
-               dbg_log (_("cannot stat() file `%s': %s"),
-                        dbs[cnt].filename, strerror (errno));
-               dbs[cnt].check_file = 0;
-             }
-           else
-             dbs[cnt].file_mtime = st.st_mtime;
-         }
        }
  
    /* Create the socket.  */
-  sock = socket (AF_UNIX, SOCK_STREAM, 0);
+  sock = socket (AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0);
    if (sock < 0)
      {
        dbg_log (_("cannot open socket: %s"), strerror (errno));
-      exit (1);
+      do_exit (errno == EACCES ? 4 : 1, 0, NULL);
      }
    /* Bind a name to the socket.  */
+  struct sockaddr_un sock_addr;
    sock_addr.sun_family = AF_UNIX;
    strcpy (sock_addr.sun_path, _PATH_NSCDSOCKET);
    if (bind (sock, (struct sockaddr *) &sock_addr, sizeof (sock_addr)) < 0)
      {
        dbg_log ("%s: %s", _PATH_NSCDSOCKET, strerror (errno));
-      exit (1);
-    }
-
-  /* We don't want to get stuck on accept.  */
-  int fl = fcntl (sock, F_GETFL);
-  if (fl == -1 || fcntl (sock, F_SETFL, fl | O_NONBLOCK) == -1)
-    {
-      dbg_log (_("cannot change socket to nonblocking mode: %s"),
-              strerror (errno));
-      exit (1);
-    }
-
-  /* The descriptor needs to be closed on exec.  */
-  if (paranoia && fcntl (sock, F_SETFD, FD_CLOEXEC) == -1)
-    {
-      dbg_log (_("cannot set socket to close on exec: %s"),
-              strerror (errno));
-      exit (1);
+      do_exit (errno == EACCES ? 4 : 1, 0, NULL);
      }
  
    /* Set permissions for the socket.  */
@@ -533,14 +849,132 @@ cannot set socket to close on exec: %s; disabling paranoia mode"),
      {
        dbg_log (_("cannot enable socket to accept connections: %s"),
                strerror (errno));
-      exit (1);
+      do_exit (1, 0, NULL);
+    }
+
+#ifdef HAVE_NETLINK
+  if (dbs[hstdb].enabled)
+    {
+      /* Try to open netlink socket to monitor network setting changes.  */
+      nl_status_fd = socket (AF_NETLINK,
+                            SOCK_RAW | SOCK_CLOEXEC | SOCK_NONBLOCK,
+                            NETLINK_ROUTE);
+      if (nl_status_fd != -1)
+       {
+         struct sockaddr_nl snl;
+         memset (&snl, '\0', sizeof (snl));
+         snl.nl_family = AF_NETLINK;
+         /* XXX Is this the best set to use?  */
+         snl.nl_groups = (RTMGRP_IPV4_IFADDR | RTMGRP_TC | RTMGRP_IPV4_MROUTE
+                          | RTMGRP_IPV4_ROUTE | RTMGRP_IPV4_RULE
+                          | RTMGRP_IPV6_IFADDR | RTMGRP_IPV6_MROUTE
+                          | RTMGRP_IPV6_ROUTE | RTMGRP_IPV6_IFINFO
+                          | RTMGRP_IPV6_PREFIX);
+
+         if (bind (nl_status_fd, (struct sockaddr *) &snl, sizeof (snl)) != 0)
+           {
+             close (nl_status_fd);
+             nl_status_fd = -1;
+           }
+         else
+           {
+             /* Start the timestamp process.  */
+             dbs[hstdb].head->extra_data[NSCD_HST_IDX_CONF_TIMESTAMP]
+               = __bump_nl_timestamp ();
+           }
+       }
      }
+#endif
  
-  /* Change to unprivileged uid/gid/groups if specifed in config file */
+  /* Change to unprivileged uid/gid/groups if specified in config file */
    if (server_user != NULL)
      finish_drop_privileges ();
  }
  
+#ifdef HAVE_INOTIFY
+#define TRACED_FILE_MASK (IN_DELETE_SELF | IN_CLOSE_WRITE | IN_MOVE_SELF)
+#define TRACED_DIR_MASK (IN_DELETE_SELF | IN_CREATE | IN_MOVED_TO | IN_MOVE_SELF)
+/* Install the inotify watches for the traced file FINFO on INOTIFY_FD:
+   one on the file itself (FINFO->fname) and one on its parent directory
+   (FINFO->dname).  A watch is only requested if the corresponding
+   descriptor in FINFO->inotify_descr is still negative, so the function
+   can be called again to restore a missing watch.  If a watch cannot be
+   added, the failure is logged and the function returns without
+   attempting the remaining steps.  */
+void
+install_watches (struct traced_file *finfo)
+{
+  /* Use inotify support if we have it.  */
+  if (finfo->inotify_descr[TRACED_FILE] < 0)
+    finfo->inotify_descr[TRACED_FILE] = inotify_add_watch (inotify_fd,
+                                                          finfo->fname,
+                                                          TRACED_FILE_MASK);
+  if (finfo->inotify_descr[TRACED_FILE] < 0)
+    {
+      dbg_log (_("disabled inotify-based monitoring for file `%s': %s"),
+                finfo->fname, strerror (errno));
+      return;
+    }
+  dbg_log (_("monitoring file `%s` (%d)"),
+          finfo->fname, finfo->inotify_descr[TRACED_FILE]);
+  /* Additionally listen for events in the file's parent directory.
+     We do this because the file to be watched might be
+     deleted and then added back again.  When it is added back again
+     we must re-add the watch.  We must also cover IN_MOVED_TO to
+     detect a file being moved into the directory.  */
+  if (finfo->inotify_descr[TRACED_DIR] < 0)
+    finfo->inotify_descr[TRACED_DIR] = inotify_add_watch (inotify_fd,
+                                                         finfo->dname,
+                                                         TRACED_DIR_MASK);
+  if (finfo->inotify_descr[TRACED_DIR] < 0)
+    {
+      /* Report the directory name: that is the path the watch was
+        requested for.  */
+      dbg_log (_("disabled inotify-based monitoring for directory `%s': %s"),
+                finfo->dname, strerror (errno));
+      return;
+    }
+  dbg_log (_("monitoring directory `%s` (%d)"),
+          finfo->dname, finfo->inotify_descr[TRACED_DIR]);
+}
+#endif
+
+/* Add the file described by FINFO to the files traced for the database
+   DBS[DBIDX].
+
+   Any number of files can be registered for one database; every call
+   puts one more entry at the head of the database's list.
+
+   The registered files are examined for changes whenever the database
+   is pruned (because of a timeout or an invalidation request) and
+   whenever a new connection with a cache request is accepted.
+
+   With inotify support, watches are installed so that deletion or
+   modification of the file is reported; either event requires the
+   cache of the database to be invalidated.  The modification time from
+   stat is recorded as well so that changes can be detected without
+   inotify.  */
+void
+register_traced_file (size_t dbidx, struct traced_file *finfo)
+{
+  struct database_dyn *db = &dbs[dbidx];
+
+  /* A registration only makes sense for an enabled database that has
+     file checking turned on.  */
+  if (! (db->enabled && db->check_file))
+    return;
+
+  if (__glibc_unlikely (debug_level > 0))
+    dbg_log (_("monitoring file %s for database %s"),
+            finfo->fname, dbnames[dbidx]);
+
+#ifdef HAVE_INOTIFY
+  install_watches (finfo);
+#endif
+
+  struct stat64 st;
+  if (stat64 (finfo->fname, &st) >= 0)
+    finfo->mtime = st.st_mtime;
+  else
+    {
+      /* The file cannot be examined right now.  A zero mtime makes
+        sure it is looked at again later.  */
+      dbg_log (_("stat failed for file `%s'; will try again later: %s"),
+              finfo->fname, strerror (errno));
+      finfo->mtime = 0;
+    }
+
+  /* Put the new entry at the head of the list.  */
+  finfo->next = db->traced_files;
+  db->traced_files = finfo;
+}
+
  
  /* Close the connections.  */
  void
@@ -551,26 +985,54 @@ close_sockets (void)
  
  
  static void
-invalidate_cache (char *key)
+invalidate_cache (char *key, int fd)
  {
    dbtype number;
+  int32_t resp;
  
-  if (strcmp (key, "passwd") == 0)
-    number = pwddb;
-  else if (strcmp (key, "group") == 0)
-    number = grpdb;
-  else if (__builtin_expect (strcmp (key, "hosts"), 0) == 0)
-    {
-      number = hstdb;
+  for (number = pwddb; number < lastdb; ++number)
+    if (strcmp (key, dbnames[number]) == 0)
+      {
+       struct traced_file *runp = dbs[number].traced_files;
+       while (runp != NULL)
+         {
+           /* Make sure we reload from file when checking mtime.  */
+           runp->mtime = 0;
+#ifdef HAVE_INOTIFY
+           /* During an invalidation we try to reload the traced
+              file watches.  This allows the user to re-sync if
+              inotify events were lost.  Similar to what we do during
+              pruning.  */
+           install_watches (runp);
+#endif
+           if (runp->call_res_init)
+             {
+               res_init ();
+               break;
+             }
+           runp = runp->next;
+         }
+       break;
+      }
  
-      /* Re-initialize the resolver.  resolv.conf might have changed.  */
-      res_init ();
+  if (number == lastdb)
+    {
+      resp = EINVAL;
+      writeall (fd, &resp, sizeof (resp));
+      return;
      }
-  else
-    return;
  
    if (dbs[number].enabled)
-    prune_cache (&dbs[number], LONG_MAX);
+    {
+      pthread_mutex_lock (&dbs[number].prune_run_lock);
+      prune_cache (&dbs[number], LONG_MAX, fd);
+      pthread_mutex_unlock (&dbs[number].prune_run_lock);
+    }
+  else
+    {
+      resp = 0;
+      writeall (fd, &resp, sizeof (resp));
+    }
  }
  
  
@@ -583,21 +1045,32 @@ send_ro_fd (struct database_dyn *db, char *key, int fd)
      return;
  
    /* We need to send some data along with the descriptor.  */
-  struct iovec iov[1];
+  uint64_t mapsize = (db->head->data_size
+                     + roundup (db->head->module * sizeof (ref_t), ALIGN)
+                     + sizeof (struct database_pers_head));
+  struct iovec iov[2];
    iov[0].iov_base = key;
    iov[0].iov_len = strlen (key) + 1;
+  iov[1].iov_base = &mapsize;
+  iov[1].iov_len = sizeof (mapsize);
  
    /* Prepare the control message to transfer the descriptor.  */
-  char buf[CMSG_SPACE (sizeof (int))];
-  struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 1,
-                       .msg_control = buf, .msg_controllen = sizeof (buf) };
+  union
+  {
+    struct cmsghdr hdr;
+    char bytes[CMSG_SPACE (sizeof (int))];
+  } buf;
+  struct msghdr msg = { .msg_iov = iov, .msg_iovlen = 2,
+                       .msg_control = buf.bytes,
+                       .msg_controllen = sizeof (buf) };
    struct cmsghdr *cmsg = CMSG_FIRSTHDR (&msg);
  
    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type = SCM_RIGHTS;
    cmsg->cmsg_len = CMSG_LEN (sizeof (int));
  
-  *(int *) CMSG_DATA (cmsg) = db->ro_fd;
+  int *ip = (int *) CMSG_DATA (cmsg);
+  *ip = db->ro_fd;
  
    msg.msg_controllen = cmsg->cmsg_len;
  
@@ -608,7 +1081,7 @@ send_ro_fd (struct database_dyn *db, char *key, int fd)
  #endif
    (void) TEMP_FAILURE_RETRY (sendmsg (fd, &msg, MSG_NOSIGNAL));
  
-  if (__builtin_expect (debug_level > 0, 0))
+  if (__glibc_unlikely (debug_level > 0))
      dbg_log (_("provide access to FD %d, for %s"), db->ro_fd, key);
  }
  #endif /* SCM_RIGHTS */
@@ -616,7 +1089,7 @@ send_ro_fd (struct database_dyn *db, char *key, int fd)
  
  /* Handle new request.  */
  static void
-handle_request (int fd, request_header *req, void *key, uid_t uid)
+handle_request (int fd, request_header *req, void *key, uid_t uid, pid_t pid)
  {
    if (__builtin_expect (req->version, NSCD_VERSION) != NSCD_VERSION)
      {
@@ -627,22 +1100,42 @@ cannot handle old request version %d; current version is %d"),
        return;
      }
  
-  /* Make the SELinux check before we go on to the standard checks.  We
-     need to verify that the request type is valid, since it has not
-     yet been checked at this point.  */
-  if (selinux_enabled
-      && __builtin_expect (req->type, GETPWBYNAME) >= GETPWBYNAME
-      && __builtin_expect (req->type, LASTREQ) < LASTREQ
-      && nscd_request_avc_has_perm (fd, req->type) != 0)
-    return;
+  /* Perform the SELinux check before we go on to the standard checks.  */
+  if (selinux_enabled && nscd_request_avc_has_perm (fd, req->type) != 0)
+    {
+      if (debug_level > 0)
+       {
+#ifdef SO_PEERCRED
+# ifdef PATH_MAX
+         char buf[PATH_MAX];
+# else
+         char buf[4096];
+# endif
  
-  struct database_dyn *db = serv2db[req->type];
+         snprintf (buf, sizeof (buf), "/proc/%ld/exe", (long int) pid);
+         ssize_t n = readlink (buf, buf, sizeof (buf) - 1);
  
-  // XXX Clean up so that each new command need not introduce a
-  // XXX new conditional.
-  if ((__builtin_expect (req->type, GETPWBYNAME) >= GETPWBYNAME
-       && __builtin_expect (req->type, LASTDBREQ) <= LASTDBREQ)
-      || req->type == GETAI || req->type == INITGROUPS)
+         if (n <= 0)
+           dbg_log (_("\
+request from %ld not handled due to missing permission"), (long int) pid);
+         else
+           {
+             buf[n] = '\0';
+             dbg_log (_("\
+request from '%s' [%ld] not handled due to missing permission"),
+                      buf, (long int) pid);
+           }
+#else
+         dbg_log (_("request not handled due to missing permission"));
+#endif
+       }
+      return;
+    }
+
+  struct database_dyn *db = reqinfo[req->type].db;
+
+  /* See whether we can service the request from the cache.  */
+  if (__builtin_expect (reqinfo[req->type].data_request, true))
      {
        if (__builtin_expect (debug_level, 0) > 0)
         {
@@ -660,11 +1153,12 @@ cannot handle old request version %d; current version is %d"),
         }
  
        /* Is this service enabled?  */
-      if (!db->enabled)
+      if (__glibc_unlikely (!db->enabled))
         {
           /* No, sent the prepared record.  */
-         if (TEMP_FAILURE_RETRY (write (fd, db->disabled_iov->iov_base,
-                                        db->disabled_iov->iov_len))
+         if (TEMP_FAILURE_RETRY (send (fd, db->disabled_iov->iov_base,
+                                       db->disabled_iov->iov_len,
+                                       MSG_NOSIGNAL))
               != (ssize_t) db->disabled_iov->iov_len
               && __builtin_expect (debug_level, 0) > 0)
             {
@@ -678,7 +1172,7 @@ cannot handle old request version %d; current version is %d"),
         }
  
        /* Be sure we can read the data.  */
-      if (__builtin_expect (pthread_rwlock_tryrdlock (&db->lock) != 0, 0))
+      if (__glibc_unlikely (pthread_rwlock_tryrdlock (&db->lock) != 0))
         {
           ++db->head->rdlockdelayed;
           pthread_rwlock_rdlock (&db->lock);
@@ -691,8 +1185,34 @@ cannot handle old request version %d; current version is %d"),
        if (cached != NULL)
         {
           /* Hurray it's in the cache.  */
-         if (TEMP_FAILURE_RETRY (write (fd, cached->data, cached->recsize))
-             != cached->recsize
+         ssize_t nwritten;
+
+#ifdef HAVE_SENDFILE
+         if (__glibc_likely (db->mmap_used))
+           {
+             assert (db->wr_fd != -1);
+             assert ((char *) cached->data > (char *) db->data);
+             assert ((char *) cached->data - (char *) db->head
+                     + cached->recsize
+                     <= (sizeof (struct database_pers_head)
+                         + db->head->module * sizeof (ref_t)
+                         + db->head->data_size));
+             nwritten = sendfileall (fd, db->wr_fd,
+                                     (char *) cached->data
+                                     - (char *) db->head, cached->recsize);
+# ifndef __ASSUME_SENDFILE
+             if (nwritten == -1 && errno == ENOSYS)
+               goto use_write;
+# endif
+           }
+         else
+# ifndef __ASSUME_SENDFILE
+         use_write:
+# endif
+#endif
+           nwritten = writeall (fd, cached->data, cached->recsize);
+
+         if (nwritten != cached->recsize
               && __builtin_expect (debug_level, 0) > 0)
             {
               /* We have problems sending the result.  */
@@ -759,32 +1279,47 @@ cannot handle old request version %d; current version is %d"),
        addinitgroups (db, fd, req, key, uid);
        break;
  
+    case GETSERVBYNAME:
+      addservbyname (db, fd, req, key, uid);
+      break;
+
+    case GETSERVBYPORT:
+      addservbyport (db, fd, req, key, uid);
+      break;
+
+    case GETNETGRENT:
+      addgetnetgrent (db, fd, req, key, uid);
+      break;
+
+    case INNETGR:
+      addinnetgr (db, fd, req, key, uid);
+      break;
+
      case GETSTAT:
      case SHUTDOWN:
      case INVALIDATE:
-      if (! secure_in_use)
-       {
-         /* Get the callers credentials.  */
+      {
+       /* Get the caller's credentials.  */
  #ifdef SO_PEERCRED
-         struct ucred caller;
-         socklen_t optlen = sizeof (caller);
+       struct ucred caller;
+       socklen_t optlen = sizeof (caller);
  
-         if (getsockopt (fd, SOL_SOCKET, SO_PEERCRED, &caller, &optlen) < 0)
-           {
-             char buf[256];
+       if (getsockopt (fd, SOL_SOCKET, SO_PEERCRED, &caller, &optlen) < 0)
+         {
+           char buf[256];
  
-             dbg_log (_("error getting callers id: %s"),
-                      strerror_r (errno, buf, sizeof (buf)));
-             break;
-           }
+           dbg_log (_("error getting caller's id: %s"),
+                    strerror_r (errno, buf, sizeof (buf)));
+           break;
+         }
  
-         uid = caller.uid;
+       uid = caller.uid;
  #else
-         /* Some systems have no SO_PEERCRED implementation.  They don't
-            care about security so we don't as well.  */
-         uid = 0;
+       /* Some systems have no SO_PEERCRED implementation.  They don't
+          care about security so we don't either.  */
+       uid = 0;
  #endif
-       }
+      }
  
        /* Accept shutdown, getstat and invalidate only from root.  For
          the stat call also allow the user specified in the config file.  */
@@ -796,7 +1331,7 @@ cannot handle old request version %d; current version is %d"),
        else if (uid == 0)
         {
           if (req->type == INVALIDATE)
-           invalidate_cache (key);
+           invalidate_cache (key, fd);
           else
             termination_handler (0);
         }
@@ -805,8 +1340,10 @@ cannot handle old request version %d; current version is %d"),
      case GETFDPW:
      case GETFDGR:
      case GETFDHST:
+    case GETFDSERV:
+    case GETFDNETGR:
  #ifdef SCM_RIGHTS
-      send_ro_fd (serv2db[req->type], key, fd);
+      send_ro_fd (reqinfo[req->type].db, key, fd);
  #endif
        break;
  
@@ -846,7 +1383,7 @@ cannot open /proc/self/cmdline: %s; disabling paranoia mode"),
        if (n == -1)
         {
           dbg_log (_("\
-cannot open /proc/self/cmdline: %s; disabling paranoia mode"),
+cannot read /proc/self/cmdline: %s; disabling paranoia mode"),
                    strerror (errno));
  
           close (fd);
@@ -883,7 +1420,7 @@ cannot open /proc/self/cmdline: %s; disabling paranoia mode"),
    /* Second, change back to the old user if we changed it.  */
    if (server_user != NULL)
      {
-      if (setuid (old_uid) != 0)
+      if (setresuid (old_uid, old_uid, old_uid) != 0)
         {
           dbg_log (_("\
  cannot change to old UID: %s; disabling paranoia mode"),
@@ -893,13 +1430,13 @@ cannot change to old UID: %s; disabling paranoia mode"),
           return;
         }
  
-      if (setgid (old_gid) != 0)
+      if (setresgid (old_gid, old_gid, old_gid) != 0)
         {
           dbg_log (_("\
  cannot change to old GID: %s; disabling paranoia mode"),
                    strerror (errno));
  
-         setuid (server_uid);
+         ignore_value (setuid (server_uid));
           paranoia = 0;
           return;
         }
@@ -914,25 +1451,43 @@ cannot change to old working directory: %s; disabling paranoia mode"),
  
        if (server_user != NULL)
         {
-         setuid (server_uid);
-         setgid (server_gid);
+         ignore_value (setuid (server_uid));
+         ignore_value (setgid (server_gid));
         }
        paranoia = 0;
        return;
      }
  
    /* Synchronize memory.  */
+  int32_t certainly[lastdb];
    for (int cnt = 0; cnt < lastdb; ++cnt)
-    {
-      /* Make sure nobody keeps using the database.  */
-      dbs[cnt].head->timestamp = 0;
+    if (dbs[cnt].enabled)
+      {
+       /* Make sure nobody keeps using the database.  */
+       dbs[cnt].head->timestamp = 0;
+       certainly[cnt] = dbs[cnt].head->nscd_certainly_running;
+       dbs[cnt].head->nscd_certainly_running = 0;
  
-      if (dbs[cnt].persistent)
-       // XXX async OK?
-       msync (dbs[cnt].head, dbs[cnt].memsize, MS_ASYNC);
-    }
+       if (dbs[cnt].persistent)
+         // XXX async OK?
+         msync (dbs[cnt].head, dbs[cnt].memsize, MS_ASYNC);
+      }
  
    /* The preparations are done.  */
+#ifdef PATH_MAX
+  char pathbuf[PATH_MAX];
+#else
+  char pathbuf[256];
+#endif
+  /* Try to exec the real nscd program so the process name (as reported
+     in /proc/PID/status) will be 'nscd', but fall back to /proc/self/exe
+     if readlink or the exec with the result of the readlink call fails.  */
+  ssize_t n = readlink ("/proc/self/exe", pathbuf, sizeof (pathbuf) - 1);
+  if (n != -1)
+    {
+      pathbuf[n] = '\0';
+      execv (pathbuf, argv);
+    }
    execv ("/proc/self/exe", argv);
  
    /* If we come here, we will never be able to re-exec.  */
@@ -941,11 +1496,22 @@ cannot change to old working directory: %s; disabling paranoia mode"),
  
    if (server_user != NULL)
      {
-      setuid (server_uid);
-      setgid (server_gid);
+      ignore_value (setuid (server_uid));
+      ignore_value (setgid (server_gid));
      }
-  chdir ("/");
+  if (chdir ("/") != 0)
+    dbg_log (_("cannot change current working directory to \"/\": %s"),
+            strerror (errno));
    paranoia = 0;
+
+  /* Reenable the databases.  */
+  time_t now = time (NULL);
+  for (int cnt = 0; cnt < lastdb; ++cnt)
+    if (dbs[cnt].enabled)
+      {
+       dbs[cnt].head->timestamp = now;
+       dbs[cnt].head->nscd_certainly_running = certainly[cnt];
+      }
  }
  
  
@@ -963,7 +1529,7 @@ static struct fdlist *readylist;
  /* Conditional variable and mutex to signal availability of entries in
     READYLIST.  The condvar is initialized dynamically since we might
     use a different clock depending on availability.  */
-static pthread_cond_t readylist_cond;
+static pthread_cond_t readylist_cond = PTHREAD_COND_INITIALIZER;
  static pthread_mutex_t readylist_lock = PTHREAD_MUTEX_INITIALIZER;
  
  /* The clock to use with the condvar.  */
@@ -973,32 +1539,112 @@ static clockid_t timeout_clock = CLOCK_REALTIME;
  static unsigned long int nready;
  
  
-/* This is the main loop.  It is replicated in different threads but the
-   `poll' call makes sure only one thread handles an incoming connection.  */
+/* Function for the clean-up threads.  */
  static void *
  __attribute__ ((__noreturn__))
-nscd_run (void *p)
+nscd_run_prune (void *p)
  {
    const long int my_number = (long int) p;
-  const int run_prune = my_number < lastdb && dbs[my_number].enabled;
+  assert (dbs[my_number].enabled);
+
+  int dont_need_update = setup_thread (&dbs[my_number]);
+
+  time_t now = time (NULL);
+
+  /* We are running.  */
+  dbs[my_number].head->timestamp = now;
+
    struct timespec prune_ts;
-  int to = 0;
-  char buf[256];
+  if (__glibc_unlikely (clock_gettime (timeout_clock, &prune_ts) == -1))
+    /* Should never happen.  */
+    abort ();
+
+  /* Compute the initial timeout time.  Prevent all the timers from
+     going off at the same time by adding a db-based value.  */
+  prune_ts.tv_sec += CACHE_PRUNE_INTERVAL + my_number;
+  dbs[my_number].wakeup_time = now + CACHE_PRUNE_INTERVAL + my_number;
+
+  pthread_mutex_t *prune_lock = &dbs[my_number].prune_lock;
+  pthread_mutex_t *prune_run_lock = &dbs[my_number].prune_run_lock;
+  pthread_cond_t *prune_cond = &dbs[my_number].prune_cond;
  
-  if (run_prune)
+  pthread_mutex_lock (prune_lock);
+  while (1)
      {
-      setup_thread (&dbs[my_number]);
+      /* Wait, but not forever.  */
+      int e = 0;
+      if (! dbs[my_number].clear_cache)
+       e = pthread_cond_timedwait (prune_cond, prune_lock, &prune_ts);
+      assert (__builtin_expect (e == 0 || e == ETIMEDOUT, 1));
+
+      time_t next_wait;
+      now = time (NULL);
+      if (e == ETIMEDOUT || now >= dbs[my_number].wakeup_time
+         || dbs[my_number].clear_cache)
+       {
+         /* We will determine the new timeout values based on the
+            cache content.  Should there be concurrent additions to
+            the cache which are not accounted for in the cache
+            pruning we want to know about it.  Therefore set the
+            timeout to the maximum.  It will be decreased when adding
+            new entries to the cache, if necessary.  */
+         dbs[my_number].wakeup_time = MAX_TIMEOUT_VALUE;
+
+         /* Unconditionally reset the flag.  */
+         time_t prune_now = dbs[my_number].clear_cache ? LONG_MAX : now;
+         dbs[my_number].clear_cache = 0;
+
+         pthread_mutex_unlock (prune_lock);
+
+         /* We use a separate lock for running the prune function (instead
+            of keeping prune_lock locked) because this enables concurrent
+            invocations of cache_add which might modify the timeout value.  */
+         pthread_mutex_lock (prune_run_lock);
+         next_wait = prune_cache (&dbs[my_number], prune_now, -1);
+         pthread_mutex_unlock (prune_run_lock);
+
+         next_wait = MAX (next_wait, CACHE_PRUNE_INTERVAL);
+         /* If clients cannot determine for sure whether nscd is running
+            we need to wake up occasionally to update the timestamp.
+            Wait 90% of the update period.  */
+#define UPDATE_MAPPING_TIMEOUT (MAPPING_TIMEOUT * 9 / 10)
+         if (__glibc_unlikely (! dont_need_update))
+           {
+             next_wait = MIN (UPDATE_MAPPING_TIMEOUT, next_wait);
+             dbs[my_number].head->timestamp = now;
+           }
  
-      /* We are running.  */
-      dbs[my_number].head->timestamp = time (NULL);
+         pthread_mutex_lock (prune_lock);
+
+         /* Make it known when we will wake up again.  */
+         if (now + next_wait < dbs[my_number].wakeup_time)
+           dbs[my_number].wakeup_time = now + next_wait;
+         else
+           next_wait = dbs[my_number].wakeup_time - now;
+       }
+      else
+       /* The cache was just pruned.  Do not do it again now.  Just
+          use the new timeout value.  */
+       next_wait = dbs[my_number].wakeup_time - now;
  
        if (clock_gettime (timeout_clock, &prune_ts) == -1)
         /* Should never happen.  */
         abort ();
  
-      /* Compute timeout time.  */
-      prune_ts.tv_sec += CACHE_PRUNE_INTERVAL;
+      /* Compute next timeout time.  */
+      prune_ts.tv_sec += next_wait;
      }
+}
+
+
+/* This is the main loop.  It is replicated in different threads but
+   the use of the ready list makes sure only one thread handles an
+   incoming connection.  */
+static void *
+__attribute__ ((__noreturn__))
+nscd_run_worker (void *p)
+{
+  char buf[256];
  
    /* Initial locking.  */
    pthread_mutex_lock (&readylist_lock);
@@ -1009,26 +1655,7 @@ nscd_run (void *p)
    while (1)
      {
        while (readylist == NULL)
-       {
-         if (run_prune)
-           {
-             /* Wait, but not forever.  */
-             to = pthread_cond_timedwait (&readylist_cond, &readylist_lock,
-                                          &prune_ts);
-
-             /* If we were woken and there is no work to be done,
-                just start pruning.  */
-             if (readylist == NULL && to == ETIMEDOUT)
-               {
-                 --nready;
-                 pthread_mutex_unlock (&readylist_lock);
-                 goto only_prune;
-               }
-           }
-         else
-           /* No need to timeout.  */
-           pthread_cond_wait (&readylist_cond, &readylist_lock);
-       }
+       pthread_cond_wait (&readylist_cond, &readylist_lock);
  
        struct fdlist *it = readylist->next;
        if (readylist->next == readylist)
@@ -1048,10 +1675,15 @@ nscd_run (void *p)
        /* We are done with the list.  */
        pthread_mutex_unlock (&readylist_lock);
  
-      /* We do not want to block on a short read or so.  */
-      int fl = fcntl (fd, F_GETFL);
-      if (fl == -1 || fcntl (fd, F_SETFL, fl | O_NONBLOCK) == -1)
-       goto close_and_out;
+#ifndef __ASSUME_ACCEPT4
+      if (have_accept4 < 0)
+       {
+         /* We do not want to block on a short read or so.  */
+         int fl = fcntl (fd, F_GETFL);
+         if (fl == -1 || fcntl (fd, F_SETFL, fl | O_NONBLOCK) == -1)
+           goto close_and_out;
+       }
+#endif
  
        /* Now read the request.  */
        request_header req;
@@ -1076,25 +1708,7 @@ nscd_run (void *p)
  #ifdef SO_PEERCRED
        pid_t pid = 0;
  
-      if (secure_in_use)
-       {
-         struct ucred caller;
-         socklen_t optlen = sizeof (caller);
-
-         if (getsockopt (fd, SOL_SOCKET, SO_PEERCRED, &caller, &optlen) < 0)
-           {
-             dbg_log (_("error getting callers id: %s"),
-                      strerror_r (errno, buf, sizeof (buf)));
-             goto close_and_out;
-           }
-
-         if (req.type < GETPWBYNAME || req.type > LASTDBREQ
-             || serv2db[req.type]->secure)
-           uid = caller.uid;
-
-         pid = caller.pid;
-       }
-      else if (__builtin_expect (debug_level > 0, 0))
+      if (__glibc_unlikely (debug_level > 0))
         {
           struct ucred caller;
           socklen_t optlen = sizeof (caller);
@@ -1102,11 +1716,12 @@ nscd_run (void *p)
           if (getsockopt (fd, SOL_SOCKET, SO_PEERCRED, &caller, &optlen) == 0)
             pid = caller.pid;
         }
+#else
+      const pid_t pid = 0;
  #endif
  
        /* It should not be possible to crash the nscd with a silly
          request (i.e., a terribly large key).  We limit the size to 1kb.  */
-#define MAXKEYLEN 1024
        if (__builtin_expect (req.key_len, 1) < 0
           || __builtin_expect (req.key_len, 1) > MAXKEYLEN)
         {
@@ -1116,7 +1731,7 @@ nscd_run (void *p)
        else
         {
           /* Get the key.  */
-         char keybuf[MAXKEYLEN];
+         char keybuf[MAXKEYLEN + 1];
  
           if (__builtin_expect (TEMP_FAILURE_RETRY (read (fd, keybuf,
                                                           req.key_len))
@@ -1128,6 +1743,7 @@ nscd_run (void *p)
                          strerror_r (errno, buf, sizeof (buf)));
               goto close_and_out;
             }
+         keybuf[req.key_len] = '\0';
  
           if (__builtin_expect (debug_level, 0) > 0)
             {
@@ -1143,42 +1759,20 @@ handle_request: request received (Version = %d)"), req.version);
             }
  
           /* Phew, we got all the data, now process it.  */
-         handle_request (fd, &req, keybuf, uid);
+         handle_request (fd, &req, keybuf, uid, pid);
         }
  
      close_and_out:
        /* We are done.  */
        close (fd);
  
-      /* Check whether we should be pruning the cache. */
-      assert (run_prune || to == 0);
-      if (to == ETIMEDOUT)
-       {
-       only_prune:
-         /* The pthread_cond_timedwait() call timed out.  It is time
-                to clean up the cache.  */
-         assert (my_number < lastdb);
-         prune_cache (&dbs[my_number],
-                      prune_ts.tv_sec + (prune_ts.tv_nsec >= 500000000));
-
-         if (clock_gettime (timeout_clock, &prune_ts) == -1)
-           /* Should never happen.  */
-           abort ();
-
-         /* Compute next timeout time.  */
-         prune_ts.tv_sec += CACHE_PRUNE_INTERVAL;
-
-         /* In case the list is emtpy we do not want to run the prune
-            code right away again.  */
-         to = 0;
-       }
-
        /* Re-locking.  */
        pthread_mutex_lock (&readylist_lock);
  
        /* One more thread available.  */
        ++nready;
      }
+  /* NOTREACHED */
  }
  
  
@@ -1207,7 +1801,7 @@ fd_ready (int fd)
      }
  
    bool do_signal = true;
-  if (__builtin_expect (nready == 0, 0))
+  if (__glibc_unlikely (nready == 0))
      {
        ++client_queued;
        do_signal = false;
@@ -1215,12 +1809,12 @@ fd_ready (int fd)
        /* Try to start another thread to help out.  */
        pthread_t th;
        if (nthreads < max_nthreads
-         && pthread_create (&th, &attr, nscd_run,
+         && pthread_create (&th, &attr, nscd_run_worker,
                              (void *) (long int) nthreads) == 0)
         {
           /* We got another thread.  */
           ++nthreads;
-         /* The new thread might new a kick.  */
+         /* The new thread might need a kick.  */
           do_signal = true;
         }
  
@@ -1235,7 +1829,7 @@ fd_ready (int fd)
  
  
  /* Check whether restarting should happen.  */
-static inline int
+static bool
  restart_p (time_t now)
  {
    return (paranoia && readylist == NULL && nready == nthreads
@@ -1246,6 +1840,233 @@ restart_p (time_t now)
  /* Array for times a connection was accepted.  */
  static time_t *starttime;
  
+#ifdef HAVE_INOTIFY
+/* Fallback for systems which do not define PATH_MAX.  */
+# ifndef PATH_MAX
+#  define PATH_MAX 1024
+# endif
+
+/* Buffer for inotify events of changed files: the event header
+   overlaid on storage large enough for the header plus PATH_MAX
+   further bytes.  */
+union __inev
+{
+  struct inotify_event i;
+  char buf[sizeof (struct inotify_event) + PATH_MAX];
+};
+
+/* Test whether the traced file FINFO is currently present.  Returns 0
+   if it can be stat'ed, otherwise -1.  */
+int
+check_file (struct traced_file *finfo)
+{
+  /* Only the presence of the file is tested.  We could compare mtime
+     and, if it differs, re-add the watches and invalidate the
+     database, but we don't: we are called from inotify_check_files,
+     which should be doing that work.  If sufficient inotify events
+     were lost then the next pruning or invalidation will do the stat
+     and mtime check.  Leaving it out here keeps the logic simple.  */
+  struct stat64 sb;
+  return stat64 (finfo->fname, &sb) < 0 ? -1 : 0;
+}
+
+/* Process the inotify event in INEV.  If the event matches any of the files
+   registered with a database then mark that database as requiring its cache
+   to be cleared.  We indicate the cache needs clearing by setting
+   TO_CLEAR[DBCNT] to true for the matching database; entries are only
+   ever set here, never reset.  File and directory watches are removed or
+   re-added as the event requires, and res_init is called for files which
+   have call_res_init set.  */
+static void
+inotify_check_files (bool *to_clear, union __inev *inev)
+{
+  /* Check which of the files changed.  */
+  for (size_t dbcnt = 0; dbcnt < lastdb; ++dbcnt)
+    {
+      struct traced_file *finfo = dbs[dbcnt].traced_files;
+
+      while (finfo != NULL)
+       {
+         /* The configuration file was moved or deleted.
+            We stop watching it at that point, and reinitialize.  */
+         if (finfo->inotify_descr[TRACED_FILE] == inev->i.wd
+             && ((inev->i.mask & IN_MOVE_SELF)
+                 || (inev->i.mask & IN_DELETE_SELF)
+                 || (inev->i.mask & IN_IGNORED)))
+           {
+             int ret;
+             bool moved = (inev->i.mask & IN_MOVE_SELF) != 0;
+
+             /* If the path can still be stat'ed the event is ignored
+                and neither the watch nor the cache is touched.  */
+             if (check_file (finfo) == 0)
+               {
+                 dbg_log (_("ignored inotify event for `%s` (file exists)"),
+                          finfo->fname);
+                 return;
+               }
+
+             dbg_log (_("monitored file `%s` was %s, removing watch"),
+                      finfo->fname, moved ? "moved" : "deleted");
+             /* File was moved out, remove the watch.  Watches are
+                automatically removed when the file is deleted.  */
+             if (moved)
+               {
+                 ret = inotify_rm_watch (inotify_fd, inev->i.wd);
+                 if (ret < 0)
+                   dbg_log (_("failed to remove file watch `%s`: %s"),
+                            finfo->fname, strerror (errno));
+               }
+             finfo->inotify_descr[TRACED_FILE] = -1;
+             to_clear[dbcnt] = true;
+             if (finfo->call_res_init)
+               res_init ();
+             return;
+           }
+         /* The configuration file was open for writing and has just closed.
+            We reset the cache and reinitialize.  */
+         if (finfo->inotify_descr[TRACED_FILE] == inev->i.wd
+             && inev->i.mask & IN_CLOSE_WRITE)
+           {
+             /* Mark cache as needing to be cleared and reinitialize.  */
+             dbg_log (_("monitored file `%s` was written to"), finfo->fname);
+             to_clear[dbcnt] = true;
+             if (finfo->call_res_init)
+               res_init ();
+             return;
+           }
+         /* The parent directory was moved or deleted.  We trigger one last
+            invalidation.  At the next pruning or invalidation we may add
+            this watch back if the file is present again.  */
+         if (finfo->inotify_descr[TRACED_DIR] == inev->i.wd
+             && ((inev->i.mask & IN_DELETE_SELF)
+                 || (inev->i.mask & IN_MOVE_SELF)
+                 || (inev->i.mask & IN_IGNORED)))
+           {
+             bool moved = (inev->i.mask & IN_MOVE_SELF) != 0;
+             /* The directory watch may have already been removed
+                but we don't know so we just remove it again and
+                ignore the error.  Then we remove the file watch.
+                Note: watches are automatically removed for deleted
+                files.  */
+             if (moved)
+               inotify_rm_watch (inotify_fd, inev->i.wd);
+             if (finfo->inotify_descr[TRACED_FILE] != -1)
+               {
+                 dbg_log (_("monitored parent directory `%s` was %s, removing watch on `%s`"),
+                          finfo->dname, moved ? "moved" : "deleted", finfo->fname);
+                 /* The watch being removed is the one on the file, so
+                    report the file name on failure.  */
+                 if (inotify_rm_watch (inotify_fd, finfo->inotify_descr[TRACED_FILE]) < 0)
+                   dbg_log (_("failed to remove file watch `%s`: %s"),
+                            finfo->fname, strerror (errno));
+               }
+             finfo->inotify_descr[TRACED_FILE] = -1;
+             finfo->inotify_descr[TRACED_DIR] = -1;
+             to_clear[dbcnt] = true;
+             if (finfo->call_res_init)
+               res_init ();
+             /* Move on to the next entry instead of returning since
+                this might be the parent directory for multiple
+                registered files and we want to remove watches for all
+                registered files.  Advance explicitly so this entry is
+                not examined a second time.  */
+             finfo = finfo->next;
+             continue;
+           }
+         /* The parent directory had a create or moved to event.  */
+         if (finfo->inotify_descr[TRACED_DIR] == inev->i.wd
+             && ((inev->i.mask & IN_MOVED_TO)
+                 || (inev->i.mask & IN_CREATE))
+             && strcmp (inev->i.name, finfo->sfname) == 0)
+           {
+             /* We detected a directory change.  We look for the creation
+                of the file we are tracking or the move of the same file
+                into the directory.  */
+             int ret;
+             dbg_log (_("monitored file `%s` was %s, adding watch"),
+                      finfo->fname,
+                      inev->i.mask & IN_CREATE ? "created" : "moved into place");
+             /* File was moved in or created.  Regenerate the watch.  */
+             if (finfo->inotify_descr[TRACED_FILE] != -1)
+               inotify_rm_watch (inotify_fd,
+                                 finfo->inotify_descr[TRACED_FILE]);
+
+             ret = inotify_add_watch (inotify_fd,
+                                      finfo->fname,
+                                      TRACED_FILE_MASK);
+             if (ret < 0)
+               dbg_log (_("failed to add file watch `%s`: %s"),
+                        finfo->fname, strerror (errno));
+
+             /* On failure RET is negative and is stored as is.  */
+             finfo->inotify_descr[TRACED_FILE] = ret;
+
+             /* The file is new or moved so mark cache as needing to
+                be cleared and reinitialize.  */
+             to_clear[dbcnt] = true;
+             if (finfo->call_res_init)
+               res_init ();
+
+             /* Done re-adding the watch.  Don't return, we may still
+                have other files in this same directory, same watch
+                descriptor, and need to process them.  */
+           }
+         /* Other events are ignored, and we move on to the next file.  */
+         finfo = finfo->next;
+        }
+    }
+}
+
+/* Request that the cache be cleared for every database whose entry in
+   the array of booleans TO_CLEAR is true; all other databases are left
+   alone.  The TO_CLEAR array must have LASTDB entries.  */
+static inline void
+clear_db_cache (bool *to_clear)
+{
+  for (size_t idx = 0; idx < lastdb; ++idx)
+    {
+      if (! to_clear[idx])
+       continue;
+
+      /* Set the flag while holding prune_lock, then signal prune_cond
+         once the lock has been released again.  */
+      pthread_mutex_lock (&dbs[idx].prune_lock);
+      dbs[idx].clear_cache = 1;
+      pthread_mutex_unlock (&dbs[idx].prune_lock);
+      pthread_cond_signal (&dbs[idx].prune_cond);
+    }
+}
+
+/* Read events from inotify_fd until a read yields less than one event,
+   match every event against the files registered via
+   register_traced_file(), and finally request a cache clear for each
+   database that had a matching event.
+
+   Returns 0 on success.  Returns -1 if a read fails with an error other
+   than EAGAIN; in that case no cache clear is requested.  */
+int
+handle_inotify_events (void)
+{
+  bool to_clear[lastdb] = { false, };
+  union __inev inev;
+
+  while (1)
+    {
+      /* Potentially read multiple events into buf.  */
+      ssize_t nb = TEMP_FAILURE_RETRY (read (inotify_fd,
+                                            inev.buf,
+                                            sizeof (inev)));
+      if (nb < (ssize_t) sizeof (struct inotify_event))
+       {
+         /* Not even 1 event.  */
+         if (__glibc_unlikely (nb == -1 && errno != EAGAIN))
+           return -1;
+         /* Done reading events that are ready.  */
+         break;
+       }
+      /* Process all events.  The normal inotify interface delivers
+        complete events on a read and never a partial event.  The
+        event being processed always sits at the start of INEV.  */
+      while (1)
+       {
+         /* Check which of the files changed.  */
+         inotify_check_files (to_clear, &inev);
+
+         /* Size of the event just handled: header plus name.  */
+         ssize_t count = sizeof (struct inotify_event) + inev.i.len;
+         nb -= count;
+         if (nb < (ssize_t) sizeof (struct inotify_event))
+           break;
+
+         /* Shift the remaining events to the front of the buffer so
+            the next one starts at offset zero.  The source is always
+            COUNT bytes past the start, because earlier shifts already
+            moved the data.  Source and destination overlap, hence
+            memmove rather than memcpy.  */
+         memmove (inev.buf, inev.buf + count, nb);
+       }
+    }
+  /* Actually perform the cache clearing.  */
+  clear_db_cache (to_clear);
+  return 0;
+}
+
+#endif
  
  static void
  __attribute__ ((__noreturn__))
@@ -1259,6 +2080,28 @@ main_loop_poll (void)
    size_t nused = 1;
    size_t firstfree = 1;
  
+#ifdef HAVE_INOTIFY
+  if (inotify_fd != -1)
+    {
+      conns[1].fd = inotify_fd;
+      conns[1].events = POLLRDNORM;
+      nused = 2;
+      firstfree = 2;
+    }
+#endif
+
+#ifdef HAVE_NETLINK
+  size_t idx_nl_status_fd = 0;
+  if (nl_status_fd != -1)
+    {
+      idx_nl_status_fd = nused;
+      conns[nused].fd = nl_status_fd;
+      conns[nused].events = POLLRDNORM;
+      ++nused;
+      firstfree = nused;
+    }
+#endif
+
    while (1)
      {
        /* Wait for any event.  We wait at most a couple of seconds so
@@ -1281,26 +2124,90 @@ main_loop_poll (void)
           if (conns[0].revents != 0)
             {
               /* We have a new incoming connection.  Accept the connection.  */
-             int fd = TEMP_FAILURE_RETRY (accept (sock, NULL, NULL));
+             int fd;
  
-             /* use the descriptor if we have not reached the limit.  */
-             if (fd >= 0 && firstfree < nconns)
+#ifndef __ASSUME_ACCEPT4
+             fd = -1;
+             if (have_accept4 >= 0)
+#endif
                 {
-                 conns[firstfree].fd = fd;
-                 conns[firstfree].events = POLLRDNORM;
-                 starttime[firstfree] = now;
-                 if (firstfree >= nused)
-                   nused = firstfree + 1;
+                 fd = TEMP_FAILURE_RETRY (accept4 (sock, NULL, NULL,
+                                                   SOCK_NONBLOCK));
+#ifndef __ASSUME_ACCEPT4
+                 if (have_accept4 == 0)
+                   have_accept4 = fd != -1 || errno != ENOSYS ? 1 : -1;
+#endif
+               }
+#ifndef __ASSUME_ACCEPT4
+             if (have_accept4 < 0)
+               fd = TEMP_FAILURE_RETRY (accept (sock, NULL, NULL));
+#endif
  
-                 do
-                   ++firstfree;
-                 while (firstfree < nused && conns[firstfree].fd != -1);
+             /* Use the descriptor if we have not reached the limit.  */
+             if (fd >= 0)
+               {
+                 if (firstfree < nconns)
+                   {
+                     conns[firstfree].fd = fd;
+                     conns[firstfree].events = POLLRDNORM;
+                     starttime[firstfree] = now;
+                     if (firstfree >= nused)
+                       nused = firstfree + 1;
+
+                     do
+                       ++firstfree;
+                     while (firstfree < nused && conns[firstfree].fd != -1);
+                   }
+                 else
+                   /* We cannot use the connection so close it.  */
+                   close (fd);
                 }
  
               --n;
             }
  
-         for (size_t cnt = 1; cnt < nused && n > 0; ++cnt)
+         size_t first = 1;
+#ifdef HAVE_INOTIFY
+         if (inotify_fd != -1 && conns[1].fd == inotify_fd)
+           {
+             if (conns[1].revents != 0)
+               {
+                 int ret;
+                 ret = handle_inotify_events ();
+                 if (ret == -1)
+                   {
+                     /* Something went wrong when reading the inotify
+                        data.  Better disable inotify.  */
+                     dbg_log (_("disabled inotify-based monitoring after read error %d"), errno);
+                     conns[1].fd = -1;
+                     firstfree = 1;
+                     if (nused == 2)
+                       nused = 1;
+                     close (inotify_fd);
+                     inotify_fd = -1;
+                   }
+                 --n;
+               }
+
+             first = 2;
+           }
+#endif
+
+#ifdef HAVE_NETLINK
+         if (idx_nl_status_fd != 0 && conns[idx_nl_status_fd].revents != 0)
+           {
+             char buf[4096];
+             /* Read all the data.  We do not interpret it here.  */
+             while (TEMP_FAILURE_RETRY (read (nl_status_fd, buf,
+                                              sizeof (buf))) != -1)
+               ;
+
+             dbs[hstdb].head->extra_data[NSCD_HST_IDX_CONF_TIMESTAMP]
+               = __bump_nl_timestamp ();
+           }
+#endif
+
+         for (size_t cnt = first; cnt < nused && n > 0; ++cnt)
             if (conns[cnt].revents != 0)
               {
                 fd_ready (conns[cnt].fd);
@@ -1366,6 +2273,29 @@ main_loop_epoll (int efd)
      /* We cannot use epoll.  */
      return;
  
+# ifdef HAVE_INOTIFY
+  if (inotify_fd != -1)
+    {
+      ev.events = EPOLLRDNORM;
+      ev.data.fd = inotify_fd;
+      if (epoll_ctl (efd, EPOLL_CTL_ADD, inotify_fd, &ev) == -1)
+       /* We cannot use epoll.  */
+       return;
+      nused = 2;
+    }
+# endif
+
+# ifdef HAVE_NETLINK
+  if (nl_status_fd != -1)
+    {
+      ev.events = EPOLLRDNORM;
+      ev.data.fd = nl_status_fd;
+      if (epoll_ctl (efd, EPOLL_CTL_ADD, nl_status_fd, &ev) == -1)
+       /* We cannot use epoll.  */
+       return;
+    }
+# endif
+
    while (1)
      {
        struct epoll_event revs[100];
@@ -1379,8 +2309,26 @@ main_loop_epoll (int efd)
         if (revs[cnt].data.fd == sock)
           {
             /* A new connection.  */
-           int fd = TEMP_FAILURE_RETRY (accept (sock, NULL, NULL));
+           int fd;
+
+# ifndef __ASSUME_ACCEPT4
+           fd = -1;
+           if (have_accept4 >= 0)
+# endif
+             {
+               fd = TEMP_FAILURE_RETRY (accept4 (sock, NULL, NULL,
+                                                 SOCK_NONBLOCK));
+# ifndef __ASSUME_ACCEPT4
+               if (have_accept4 == 0)
+                 have_accept4 = fd != -1 || errno != ENOSYS ? 1 : -1;
+# endif
+             }
+# ifndef __ASSUME_ACCEPT4
+           if (have_accept4 < 0)
+             fd = TEMP_FAILURE_RETRY (accept (sock, NULL, NULL));
+# endif
  
+           /* Use the descriptor if we have not reached the limit.  */
             if (fd >= 0)
               {
                 /* Try to add the  new descriptor.  */
@@ -1402,13 +2350,41 @@ main_loop_epoll (int efd)
                   }
               }
           }
+# ifdef HAVE_INOTIFY
+       else if (revs[cnt].data.fd == inotify_fd)
+         {
+           int ret;
+           ret = handle_inotify_events ();
+           if (ret == -1)
+             {
+               /* Something went wrong when reading the inotify
+                  data.  Better disable inotify.  */
+               dbg_log (_("disabled inotify-based monitoring after read error %d"), errno);
+               (void) epoll_ctl (efd, EPOLL_CTL_DEL, inotify_fd, NULL);
+               close (inotify_fd);
+               inotify_fd = -1;
+               break;
+             }
+         }
+# endif
+# ifdef HAVE_NETLINK
+       else if (revs[cnt].data.fd == nl_status_fd)
+         {
+           char buf[4096];
+           /* Read all the data.  We do not interpret it here.  */
+           while (TEMP_FAILURE_RETRY (read (nl_status_fd, buf,
+                                            sizeof (buf))) != -1)
+             ;
+
+           __bump_nl_timestamp ();
+         }
+# endif
         else
           {
             /* Remove the descriptor from the epoll descriptor.  */
-           struct epoll_event ev = { 0, };
-           (void) epoll_ctl (efd, EPOLL_CTL_DEL, revs[cnt].data.fd, &ev);
+           (void) epoll_ctl (efd, EPOLL_CTL_DEL, revs[cnt].data.fd, NULL);
  
-           /* Get a worked to handle the request.  */
+           /* Get a worker to handle the request.  */
             fd_ready (revs[cnt].data.fd);
  
             /* Reset the time.  */
@@ -1424,12 +2400,16 @@ main_loop_epoll (int efd)
        /*  Now look for descriptors for accepted connections which have
           no reply in too long of a time.  */
        time_t laststart = now - ACCEPT_TIMEOUT;
+      assert (starttime[sock] == 0);
+# ifdef HAVE_INOTIFY
+      assert (inotify_fd == -1 || starttime[inotify_fd] == 0);
+# endif
+      assert (nl_status_fd == -1 || starttime[nl_status_fd] == 0);
        for (int cnt = highest; cnt > STDERR_FILENO; --cnt)
-       if (cnt != sock && starttime[cnt] != 0 && starttime[cnt] < laststart)
+       if (starttime[cnt] != 0 && starttime[cnt] < laststart)
           {
             /* We are waiting for this one for too long.  Close it.  */
-           struct epoll_event ev = {0, };
-           (void) epoll_ctl (efd, EPOLL_CTL_DEL, cnt, &ev);
+           (void) epoll_ctl (efd, EPOLL_CTL_DEL, cnt, NULL);
  
             (void) close (cnt);
  
@@ -1471,36 +2451,63 @@ start_threads (void)
         timeout_clock = CLOCK_MONOTONIC;
  #endif
  
-  pthread_cond_init (&readylist_cond, &condattr);
-  pthread_condattr_destroy (&condattr);
-
-
    /* Create the attribute for the threads.  They are all created
       detached.  */
    pthread_attr_init (&attr);
    pthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED);
    /* Use 1MB stacks, twice as much for 64-bit architectures.  */
-  pthread_attr_setstacksize (&attr, 1024 * 1024 * (sizeof (void *) / 4));
+  pthread_attr_setstacksize (&attr, NSCD_THREAD_STACKSIZE);
  
    /* We allow less than LASTDB threads only for debugging.  */
    if (debug_level == 0)
      nthreads = MAX (nthreads, lastdb);
  
-  int nfailed = 0;
-  for (long int i = 0; i < nthreads; ++i)
+  /* Create the threads which prune the databases.  */
+  // XXX Ideally this work would be done by some of the worker threads.
+  // XXX But this is problematic since we would need to be able to wake
+  // XXX them up explicitly as well as part of the group handling the
+  // XXX ready-list.  This requires an operation where we can wait on
+  // XXX two conditional variables at the same time.  This operation
+  // XXX does not exist (yet).
+  for (long int i = 0; i < lastdb; ++i)
      {
+      /* Initialize the conditional variable.  */
+      if (pthread_cond_init (&dbs[i].prune_cond, &condattr) != 0)
+       {
+         dbg_log (_("could not initialize conditional variable"));
+         do_exit (1, 0, NULL);
+       }
+
        pthread_t th;
-      if (pthread_create (&th, &attr, nscd_run, (void *) (i - nfailed)) != 0)
-       ++nfailed;
+      if (dbs[i].enabled
+         && pthread_create (&th, &attr, nscd_run_prune, (void *) i) != 0)
+       {
+         dbg_log (_("could not start clean-up thread; terminating"));
+         do_exit (1, 0, NULL);
+       }
      }
-  if (nthreads - nfailed < lastdb)
+
+  pthread_condattr_destroy (&condattr);
+
+  for (long int i = 0; i < nthreads; ++i)
      {
-      /* We could not start enough threads.  */
-      dbg_log (_("could only start %d threads; terminating"),
-              nthreads - nfailed);
-      exit (1);
+      pthread_t th;
+      if (pthread_create (&th, &attr, nscd_run_worker, NULL) != 0)
+       {
+         if (i == 0)
+           {
+             dbg_log (_("could not start any worker thread; terminating"));
+             do_exit (1, 0, NULL);
+           }
+
+         break;
+       }
      }
  
+  /* Now it is safe to let the parent know that we're doing fine and it can
+     exit.  */
+  notify_parent (0);
+
    /* Determine how much room for descriptors we should initially
       allocate.  This might need to change later if we cap the number
       with MAXCONN.  */
@@ -1545,8 +2552,8 @@ begin_drop_privileges (void)
    if (pwd == NULL)
      {
        dbg_log (_("Failed to run nscd as user '%s'"), server_user);
-      error (EXIT_FAILURE, 0, _("Failed to run nscd as user '%s'"),
-            server_user);
+      do_exit (EXIT_FAILURE, 0,
+              _("Failed to run nscd as user '%s'"), server_user);
      }
  
    server_uid = pwd->pw_uid;
@@ -1563,7 +2570,8 @@ begin_drop_privileges (void)
      {
        /* This really must never happen.  */
        dbg_log (_("Failed to run nscd as user '%s'"), server_user);
-      error (EXIT_FAILURE, errno, _("initial getgrouplist failed"));
+      do_exit (EXIT_FAILURE, errno,
+              _("initial getgrouplist failed"));
      }
  
    server_groups = (gid_t *) xmalloc (server_ngroups * sizeof (gid_t));
@@ -1572,7 +2580,7 @@ begin_drop_privileges (void)
        == -1)
      {
        dbg_log (_("Failed to run nscd as user '%s'"), server_user);
-      error (EXIT_FAILURE, errno, _("getgrouplist failed"));
+      do_exit (EXIT_FAILURE, errno, _("getgrouplist failed"));
      }
  }
  
@@ -1582,23 +2590,40 @@ begin_drop_privileges (void)
  static void
  finish_drop_privileges (void)
  {
+#if defined HAVE_LIBAUDIT && defined HAVE_LIBCAP
+  /* We need to preserve the capabilities to connect to the audit daemon.  */
+  cap_t new_caps = preserve_capabilities ();
+#endif
+
    if (setgroups (server_ngroups, server_groups) == -1)
      {
        dbg_log (_("Failed to run nscd as user '%s'"), server_user);
-      error (EXIT_FAILURE, errno, _("setgroups failed"));
+      do_exit (EXIT_FAILURE, errno, _("setgroups failed"));
      }
  
-  if (setgid (server_gid) == -1)
+  int res;
+  if (paranoia)
+    res = setresgid (server_gid, server_gid, old_gid);
+  else
+    res = setgid (server_gid);
+  if (res == -1)
      {
        dbg_log (_("Failed to run nscd as user '%s'"), server_user);
-      perror ("setgid");
-      exit (1);
+      do_exit (4, errno, "setgid");
      }
  
-  if (setuid (server_uid) == -1)
+  if (paranoia)
+    res = setresuid (server_uid, server_uid, old_uid);
+  else
+    res = setuid (server_uid);
+  if (res == -1)
      {
        dbg_log (_("Failed to run nscd as user '%s'"), server_user);
-      perror ("setuid");
-      exit (1);
+      do_exit (4, errno, "setuid");
      }
+
+#if defined HAVE_LIBAUDIT && defined HAVE_LIBCAP
+  /* Remove the temporary capabilities.  */
+  install_real_capabilities (new_caps);
+#endif
  }