/* This file is part of unscd, a complete nscd replacement.
- * Copyright (C) 2007 Denys Vlasenko. Licensed under the GPL version 2. */
+ * Copyright (C) 2007-2012 Denys Vlasenko. Licensed under the GPL version 2.
+ */
/* unscd is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
/*
Build instructions:
-gcc -Os -o nscd nscd.c
+gcc -Wall -Wunused-parameter -Os -o nscd nscd.c
gcc -fomit-frame-pointer -Wl,--sort-section -Wl,alignment -Wl,--sort-common
-Os -o nscd nscd.c
which restart crashed/hung nscd. This is ugly.
After looking at nscd source in glibc I arrived to the conclusion
-that its desidn is contributing to this significantly. Even if nscd's
+that its design is contributing to this significantly. Even if nscd's
code is 100.00% perfect and bug-free, it can still suffer from bugs
in libraries it calls.
Since nscd is multithreaded program with single shared cache,
any resource leak in any NSS library has cumulative effect.
-Even if an NSS library leaks a file descriptor 0.01% of the time,
+Even if an NSS library leaks a file descriptor 0.01% of the time,
this will make nscd crash or hang after some time.
Of course bugs in NSS .so modules should be fixed, but meanwhile
* 0.36 excercise extreme paranoia handling server-user option;
* a little bit more verbose logging:
* L_DEBUG2 log level added, use debug-level 7 to get it
+ * 0.37 users reported over-zealous "detected change in /etc/passwd",
+ * apparently stat() returns random garbage in unused padding
+ * on some systems. Made the check less paranoid.
+ * 0.38 log POLLHUP better
+ * 0.39 log answers to client better, log getpwnam in the worker,
+ * pass debug level value down to worker.
+ * 0.40 fix handling of shutdown and invalidate requests;
+ * fix bug with answer written in several pieces
+ * 0.40.1 set hints.ai_socktype = SOCK_STREAM in GETAI request
+ * 0.41 eliminate double caching of two near-simultaneous identical requests -
+ * EXPERIMENTAL
+ * 0.42 execute /proc/self/exe by link name first (better comm field)
+ * 0.43 fix off-by-one error in setgroups
+ * 0.44 make -d[ddd] bump up debug - easier to explain to users
+ * how to produce detailed log (no nscd.conf tweaking)
+ * 0.45 Fix out-of-bounds array access and log/pid file permissions -
+ * thanks to Sebastian Krahmer (krahmer AT suse.de)
+ * 0.46 fix a case when we forgot to remove a future entry on worker failure
+ * 0.47 fix nscd without -d to not bump debug level
+ * 0.48 fix for changes in __nss_disable_nscd API in glibc-2.15
+ * 0.49 minor tweaks to messages
+ * 0.50 add more files to watch for changes
+ * 0.51 fix a case where we forget to refcount-- the cached entry
+ * 0.52 make free_refcounted_ureq() tolerant to pointers to NULLs
*/
-#define PROGRAM_VERSION "0.36"
+#define PROGRAM_VERSION "0.52"
#define DEBUG_BUILD 1
** Generic helpers
*/
+#define ARRAY_SIZE(x) ((unsigned)(sizeof(x) / sizeof((x)[0])))
+
#define NORETURN __attribute__ ((__noreturn__))
SRV_HOSTS,
};
+static const char srv_name[3][7] = {
+ "passwd",
+ "group",
+ "hosts"
+};
+
static struct {
const char *logfile;
const char *user;
/* Header common to all requests */
#define USER_REQ_STRUCT \
- int32_t version; /* Version number of the daemon interface */ \
- int32_t type; /* Service requested */ \
- int32_t key_len; /* Key length */
+ uint32_t version; /* Version number of the daemon interface */ \
+ uint32_t type; /* Service requested */ \
+ uint32_t key_len; /* Key length */
typedef struct user_req_header {
USER_REQ_STRUCT
USER_REQ_STRUCT
};
struct { /* when stored in cache, overlaps .version */
- unsigned refcount:8; /* actually, can be 1 or 0 only */
+ unsigned refcount:8;
/* (timestamp24 * 256) == timestamp in ms */
unsigned timestamp24:24;
};
{
if (type == GETAI) return 0;
if (type == INITGROUPS) return 0;
- if (type > GETHOSTBYADDRv6) return 1;
+ if (type == GETSTAT) return 1;
+ if (type > INVALIDATE) return 1;
return 0;
}
-/* Possible reductions:
- * fd, bufidx - uint8_t
- * started_ms -> uint16_t started_s
- * ureq - eliminate (derivable from bufidx?)
- * cell - eliminate (derivable from resptr?)
- */
+
typedef struct client_info {
/* if client_fd != 0, we are waiting for the reply from worker
* on pfd[i].fd, and client_fd is saved client's fd
unsigned bufidx; /* buffer# in global client_buf[] */
unsigned started_ms;
unsigned respos; /* response */
- //unsigned resp_sz;
user_req *resptr; /* response */
+ user_req **cache_pp; /* cache entry address */
user_req *ureq; /* request (points to client_buf[x]) */
- user_req **cell; /* cache cell ptr */
} client_info;
+static unsigned g_now_ms;
static int min_closed = INT_MAX;
static int cnt_closed = 0;
static int num_clients = 2; /* two listening sockets are "clients" too */
/* We read up to max_reqnum requests in parallel */
static unsigned max_reqnum = 14;
static int next_buf;
-/* Each of these points to [max_reqnum] sized array */
+/* To be allocated at init to become client_buf[max_reqnum][MAX_USER_REQ_SIZE].
+ * Note: it is a pointer to char[MAX_USER_REQ_SIZE] arrays,
+ * not an array of [MAX_USER_REQ_SIZE] pointers.
+ */
static char (*client_buf)[MAX_USER_REQ_SIZE];
static char *busy_cbuf;
static struct pollfd *pfd;
static client_info *cinfo;
+/* Request, response and cache data structures:
+ *
+ * cache[] (defined later):
+ * cacheline_t cache[cache_size] array, or in other words,
+ * user_req* cache[cache_size][8] array.
+ * Every client request is hashed, hash value determines which cache[x]
+ * will have the response stored in one of its 8 elements.
+ * Cache entries have this format: request, then padding to 32 bits,
+ * then the response.
+ * Addresses in cache[x][y] may be NULL or:
+ * (&client_buf[z]) | 1: the cache miss is in progress ("future entry"):
+ * "the data is not in the cache (yet), wait for it to appear"
+ * (&client_buf[z]) | 3: the cache miss is in progress and other clients
+ * also want the same data ("shared future entry")
+ * else (non-NULL but low two bits are 0): cached data in malloc'ed block
+ *
+ * Each of these is a [max_reqnum] sized array:
+ * pfd[i] - given to poll() to wait for requests and replies.
+ * .fd: first two pfd[i]: listening Unix domain sockets, else
+ * .fd: open fd to a client, for reading client's request, or
+ * .fd: open fd to a worker, to send request and get response back
+ * cinfo[i] - auxiliary client data for pfd[i]
+ * .client_fd: open fd to a client, in case we already had read its
+ * request and got a cache miss, and created a worker or
+ * wait for another client's worker.
+ * Otherwise, it's 0 and client's fd is in pfd[i].fd
+ * .bufidx: index in client_buf[] we store client's request in
+ * .ureq: = client_buf[bufidx]
+ * .bytecnt: size of the request
+ * .started_ms: used to time out unresponsive clients
+ * .resptr: initially NULL. Later, same as cache[x][y] pointer to a cached
+ * response, or (a rare case) a "fake cache" entry:
+ * all cache[hash(request)][0..7] blocks were found busy,
+ * the result won't be cached.
+ * .respos: "write-out to client" offset
+ * .cache_pp: initially NULL. Later, &cache[x][y] where the response is,
+ * or will be stored. Remains NULL if "fake cache" entry is in use
+ *
+ * When a client has received its reply (or was otherwise closed: timeout etc),
+ * corresponding pfd[i] and cinfo[i] are removed by shifting [i+1], [i+2] etc
+ * elements down, so that both arrays never have free holes.
+ * [num_clients] is always the first free element.
+ *
+ * Each of these also is a [max_reqnum] sized array, but indexes
+ * do not correspond directly to pfd[i] and cinfo[i]:
+ * client_buf[n][MAX_USER_REQ_SIZE] - buffers we read client requests into
+ * busy_cbuf[n] - bool flags marking busy client_buf[]
+ */
+/* Possible reductions:
+ * fd, bufidx - uint8_t
+ * started_ms -> uint16_t started_s
+ * ureq - eliminate (derivable from bufidx?)
+ */
+
+/* Are special bits 0? is it a true cached entry? */
+#define CACHED_ENTRY(p) ( ((long)(p) & 3) == 0 )
+/* Are special bits 11? is it a shared future cache entry? */
+#define CACHE_SHARED(p) ( ((long)(p) & 3) == 3 )
+/* Return a ptr with special bits cleared (used for accessing data) */
+#define CACHE_PTR(p) ( (void*) ((long)(p) & ~(long)3) )
+/* Return a ptr with special bits set to x1: make future cache entry ptr */
+#define MAKE_FUTURE_PTR(p) ( (void*) ((long)(p) | 1) )
+/* Modify ptr, set special bits to 11: shared future cache entry */
+#define MARK_PTR_SHARED(pp) ( *(long*)(pp) |= 3 )
+
static inline unsigned ureq_size(const user_req *ureq)
{
return sizeof(user_req_header) + ureq->key_len;
}
-static unsigned cache_age(unsigned now_ms, const user_req *ureq)
+static unsigned cache_age(const user_req *ureq)
{
- return (uint32_t)now_ms - (ureq->timestamp24 << 8);
+ if (!CACHED_ENTRY(ureq))
+ return 0;
+ return (uint32_t) (g_now_ms - (ureq->timestamp24 << 8));
}
-static void set_cache_timestamp(user_req *ureq, unsigned now_ms)
+static void set_cache_timestamp(user_req *ureq)
{
- ureq->timestamp24 = now_ms >> 8;
+ ureq->timestamp24 = g_now_ms >> 8;
}
static int alloc_buf_no(void)
return client_buf[i];
}
-static void close_client(int i)
+static void free_refcounted_ureq(user_req **ureqp);
+
+static void close_client(unsigned i)
{
- log(L_DEBUG, "closing client %d (fd %d)", i, pfd[i].fd);
+ log(L_DEBUG, "closing client %u (fd %u,%u)", i, pfd[i].fd, cinfo[i].client_fd);
+ /* Paranoia. We had nasty bugs where client was closed twice. */
+ if (pfd[i].fd == 0)
+ return;
+
close(pfd[i].fd);
- pfd[i].fd = 0; /* flag as unused */
+ if (cinfo[i].client_fd && cinfo[i].client_fd != pfd[i].fd)
+ close(cinfo[i].client_fd);
+ pfd[i].fd = 0; /* flag as unused (coalescing needs this) */
busy_cbuf[cinfo[i].bufidx] = 0;
+
+ if (cinfo[i].cache_pp == NULL) {
+ user_req *resptr = cinfo[i].resptr;
+ if (resptr) {
+ log(L_DEBUG, "client %u: freeing fake cache entry %p", i, resptr);
+ free(resptr);
+ }
+ } else {
+ /* Most of the time, it is not freed here,
+ * only refcounted--. Freeing happens
+ * if it was deleted from cache[] but retained
+ * for writeout.
+ */
+ free_refcounted_ureq(&cinfo[i].resptr);
+ }
+
cnt_closed++;
if (i < min_closed)
min_closed = i;
/*
-** ncsd API <-> C API conversion
+** nscd API <-> C API conversion
*/
typedef struct response_header {
memset(&hints, 0, sizeof(hints));
hints.ai_flags = AI_CANONNAME;
- /* hinst.ai_socktype = SOCK_STREAM; - can kill dups (one for each possible SOCK_xxx) */
+ /* kills dups (one for each possible SOCK_xxx) */
+ /* this matches glibc behavior */
+ hints.ai_socktype = SOCK_STREAM;
ai = NULL; /* on failure getaddrinfo may leave it as-is */
err = getaddrinfo(hostname, NULL, &hints, &ai);
return hash;
}
-static user_req *find_cell_and_response(user_req ***cellp, user_req *ureq)
+static void free_refcounted_ureq(user_req **ureqp)
+{
+ user_req *ureq = *ureqp;
+
+ /* (when exactly can this happen?) */
+ if (ureq == NULL)
+ return;
+
+ if (!CACHED_ENTRY(ureq))
+ return;
+
+ if (ureq->refcount) {
+ ureq->refcount--;
+ log(L_DEBUG2, "--%p.refcount=%u", ureq, ureq->refcount);
+ } else {
+ log(L_DEBUG2, "%p.refcount=0, freeing", ureq);
+ free(ureq);
+ }
+ *ureqp = NULL;
+}
+
+static user_req **lookup_in_cache(user_req *ureq)
{
- user_req **cell;
+ user_req **cacheline;
+ int free_cache;
unsigned hash;
unsigned i;
unsigned ureq_sz = ureq_size(ureq);
/* prevent overflow and division by zero */
- if ((int)(cache_access_cnt+1) < 0) {
+ cache_access_cnt++;
+ if ((int)cache_access_cnt < 0) {
cache_access_cnt = (cache_access_cnt >> 1) + 1;
cache_hit_cnt = (cache_hit_cnt >> 1) + 1;
}
- cache_access_cnt++;
hash = bernstein_hash(&ureq->key_len, ureq_sz - offsetof(user_req, key_len), ureq->type);
- log(L_DEBUG, "hash:%08x", hash);
+ log(L_DEBUG2, "hash:%08x", hash);
hash = hash % cache_size;
- (*cellp) = cell = cache[hash];
+ cacheline = cache[hash];
+ free_cache = -1;
for (i = 0; i < 8; i++) {
- if (!cell[i])
+ user_req *cached = CACHE_PTR(cacheline[i]);
+ if (!cached) {
+ if (free_cache == -1)
+ free_cache = i;
continue;
-// TODO: do secondary hash match
+ }
/* ureq->version is always 2 and is reused in cache
* for other purposes, we need to skip it here */
- if (memcmp(&ureq->type, &cell[i]->type, ureq_sz - offsetof(user_req, type)) == 0) {
+ if (memcmp(&ureq->type, &cached->type, ureq_sz - offsetof(user_req, type)) == 0) {
log(L_DEBUG, "found in cache[%u][%u]", hash, i);
cache_hit_cnt++;
- return cell[i];
+ return &cacheline[i];
}
}
- log(L_DEBUG, "not found in cache[%u][x]", hash);
- return NULL;
-}
-static void free_refcounted_ureq(user_req **ureqp)
-{
- user_req *ureq = *ureqp;
- /* is it in use? */
- if (ureq->refcount) {
- ureq->refcount = 0; /* since it can be only 1 or 0... */
- } else {
- free(ureq);
+ if (free_cache >= 0) {
+ cached_cnt++;
+ i = free_cache;
+ log(L_DEBUG, "not found, using free cache[%u][%u]", hash, i);
+ goto ret;
}
- *ureqp = NULL;
-}
-static void save_in_cell(user_req **cell, user_req *new_cached, unsigned now_ms)
-{
unsigned oldest_idx = 0;
unsigned oldest_age = 0;
- unsigned age;
- unsigned i;
-
for (i = 0; i < 8; i++) {
- if (!cell[i]) {
- log(L_DEBUG, "using free cache[x][%u]", i);
- cached_cnt++;
- cell[i] = new_cached;
- aging_interval_ms = min_aging_interval_ms;
- return;
- }
- age = cache_age(now_ms, cell[i]);
+ unsigned age = cache_age(cacheline[i]);
if (age > oldest_age) {
oldest_age = age;
oldest_idx = i;
}
}
- log(L_DEBUG, "freeing and reusing cache[x][%u] (age %u)", oldest_idx, oldest_age);
- if (cell[oldest_idx]) {
- free_refcounted_ureq(&cell[oldest_idx]);
- } else {
- cached_cnt++;
+ if (oldest_age == 0) {
+ /* All entries in cacheline are "future" entries!
+ * This is very unlikely, but we must still work correctly.
+ * We call this "fake cache entry".
+ * The data will be "cached" only for the duration
+ * of this client's request lifetime.
+ */
+ log(L_DEBUG, "not found, and cache[%u] is full: using fake cache entry", hash);
+ return NULL;
}
- cell[oldest_idx] = new_cached;
- aging_interval_ms = min_aging_interval_ms;
+ i = oldest_idx;
+ log(L_DEBUG, "not found, freeing and reusing cache[%u][%u] (age %u)", hash, i, oldest_age);
+ free_refcounted_ureq(&cacheline[i]);
+
+ ret:
+ cacheline[i] = MAKE_FUTURE_PTR(ureq);
+ return &cacheline[i];
}
-static void age_cache(unsigned now_ms, int srv)
+static void age_cache(unsigned free_all, int srv)
{
user_req **cp = *cache;
int i;
unsigned sv = cached_cnt;
- log(L_DEBUG, "aging cache, srv:%d, now:%u", srv, now_ms);
- if (srv == -1 || !now_ms)
+ log(L_DEBUG, "aging cache, srv:%d, free_all:%u", srv, free_all);
+ if (srv == -1 || free_all)
aging_interval_ms = INT_MAX;
i = cache_size * 8;
do {
user_req *cached = *cp;
- if (cached) {
+ if (CACHED_ENTRY(cached) && cached != NULL) {
int csrv = type_to_srv[cached->type];
if (srv == -1 || srv == csrv) {
- if (!now_ms) {
+ if (free_all) {
cached_cnt--;
free_refcounted_ureq(cp);
} else {
- unsigned age = cache_age(now_ms, cached);
+ unsigned age = cache_age(cached);
response_header *resp = ureq_response(cached);
unsigned ttl = (resp->found ? config.pttl : config.nttl)[csrv];
if (age >= ttl) {
- log(L_DEBUG, "freeing: age %u positive %d ttl %u", age, resp->found, ttl);
+ log(L_DEBUG2, "freeing: age %u positive %d ttl %u", age, resp->found, ttl);
cached_cnt--;
free_refcounted_ureq(cp);
} else if (srv == -1) {
cp++;
} while (--i);
log(L_INFO, "aged cache, freed:%u, remain:%u", sv - cached_cnt, cached_cnt);
- if (srv == -1 || !now_ms)
- log(L_DEBUG, "aging interval now %u ms", aging_interval_ms);
+ log(L_DEBUG2, "aging interval now %u ms", aging_interval_ms);
}
/* Returns stdout fd of the worker, in blocking mode */
static int create_and_feed_worker(user_req *ureq)
{
- static const char *const argv[] = { "worker_nscd", NULL };
-
pid_t pid;
struct {
int rd;
if (pid < 0) /* error */
perror_and_die("vfork");
if (!pid) { /* child */
+ char param[sizeof(int)*3 + 2];
+ char *argv[3];
+
close(to_child.wr);
close(to_parent.rd);
xmovefd(to_child.rd, 0);
xmovefd(to_parent.wr, 1);
+ sprintf(param, "%u", debug);
+ argv[0] = (char*) "worker_nscd";
+ argv[1] = param;
+ argv[2] = NULL;
/* Re-exec ourself, cleaning up all allocated memory.
* fds in parent are marked CLOEXEC and will be closed too
* (modulo bugs) */
- execve("/proc/self/exe", (char**)argv, (char**)(argv+1));
- xexecve(self_exe_points_to, (char**)argv, (char**)(argv+1));
+ /* Try link name first: it's better to have comm field
+ * of "nscd" than "exe" (pgrep reported to fail to find us
+ * by name when comm field contains "exe") */
+ execve(self_exe_points_to, argv, argv+2);
+ xexecve("/proc/self/exe", argv, argv+2);
}
/* parent */
xsafe_write(to_child.wr, ureq, ureq_size(ureq));
close(to_child.wr);
+ close_on_exec(to_parent.rd);
return to_parent.rd;
}
_exit(0);
}
-static void worker(void) NORETURN;
-static void worker(void)
+static void worker(const char *param) NORETURN;
+static void worker(const char *param)
{
user_req ureq;
void *resp;
+ debug = atoi(param);
+
worker_ureq = &ureq; /* for signal handler */
/* Make sure we won't hang, but rather die */
signal(SIGILL, worker_signal_handler);
signal(SIGFPE, worker_signal_handler);
signal(SIGABRT, worker_signal_handler);
+#ifdef SIGSTKFLT
signal(SIGSTKFLT, worker_signal_handler);
+#endif
if (ureq.type == GETHOSTBYNAME
|| ureq.type == GETHOSTBYNAMEv6
(ureq.type == GETHOSTBYADDR ? AF_INET : AF_INET6)
));
} else if (ureq.type == GETPWBYNAME) {
- resp = marshal_passwd(getpwnam(ureq.reqbuf));
+ struct passwd *pw;
+ log(L_DEBUG2, "getpwnam('%s')", ureq.reqbuf);
+ pw = getpwnam(ureq.reqbuf);
+ log(L_DEBUG2, "getpwnam result:%p", pw);
+ resp = marshal_passwd(pw);
} else if (ureq.type == GETPWBYUID) {
resp = marshal_passwd(getpwuid(atoi(ureq.reqbuf)));
} else if (ureq.type == GETGRBYNAME) {
** Main loop
*/
-static const char check_filenames[][sizeof("/etc/passwd")] = {
- [SRV_PASSWD] = "/etc/passwd", /* "/etc/shadow"? */
- [SRV_GROUP] = "/etc/group",
- [SRV_HOSTS] = "/etc/hosts", /* "/etc/resolv.conf" "/etc/nsswitch.conf"? */
+static const char *const checked_filenames[] = {
+ /* Note: compiler adds another \0 byte at the end of each array element,
+ * so there are TWO \0's there.
+ */
+ [SRV_PASSWD] = "/etc/passwd\0" "/etc/passwd.cache\0" "/etc/shadow\0",
+ [SRV_GROUP] = "/etc/group\0" "/etc/group.cache\0",
+ [SRV_HOSTS] = "/etc/hosts\0" "/etc/hosts.cache\0" "/etc/resolv.conf\0" "/etc/nsswitch.conf\0",
+ /* ("foo.cache" files are maintained by libnss-cache) */
};
-static struct stat check_statbuf[sizeof(check_filenames) / sizeof(check_filenames[0])];
+static long checked_status[ARRAY_SIZE(checked_filenames)];
static void check_files(int srv)
{
- const char *file = check_filenames[srv];
- struct stat *sb = &check_statbuf[srv];
- struct stat tempbuf;
-
- memset(&tempbuf, 0, sizeof(tempbuf));
- stat(file, &tempbuf); /* ignore errors */
- tempbuf.st_atime = 0; /* this is not a change */
- if (memcmp(sb, &tempbuf, sizeof(tempbuf)) != 0) {
- log(L_INFO, "detected change in %s", file);
- memcpy(sb, &tempbuf, sizeof(tempbuf));
- age_cache(0, srv); /* frees entire cache */
+ struct stat tsb;
+ const char *file = checked_filenames[srv];
+ long v;
+
+ v = 0;
+ do {
+ memset(&tsb, 0, sizeof(tsb));
+ stat(file, &tsb); /* ignore errors */
+ /* Comparing struct stat's was giving false positives.
+ * Extracting only those fields which are interesting:
+ */
+ v ^= (long)tsb.st_mtime ^ (long)tsb.st_size ^ (long)tsb.st_ino; /* ^ (long)tsb.st_dev ? */
+ file += strlen(file) + 1;
+ } while (file[0]);
+
+ if (v != checked_status[srv]) {
+ checked_status[srv] = v;
+ log(L_INFO, "detected change in files related to service %d", srv);
+ age_cache(/*free_all:*/ 1, srv);
}
}
{
int srv;
user_req *ureq = cinfo[i].ureq;
- user_req **cell;
+ user_req **cache_pp;
user_req *ureq_and_resp;
+#if DEBUG_BUILD
log(L_DEBUG, "version:%d type:%d(%s) key_len:%d '%s'",
- ureq->version, ureq->type, typestr[ureq->type],
+ ureq->version, ureq->type,
+ ureq->type < ARRAY_SIZE(typestr) ? typestr[ureq->type] : "?",
ureq->key_len, req_str(ureq->type, ureq->reqbuf));
- hex_dump(cinfo[i].ureq, cinfo[i].bytecnt);
+#endif
- if (unsupported_ureq_type(ureq->type)) {
- /* We don't know this request. Just close the connection */
- /* (glibc client interprets this like "not supported by this nscd") */
- log(L_INFO, "unsupported query, dropping");
+ if (ureq->version != NSCD_VERSION) {
+ log(L_INFO, "wrong version");
close_client(i);
return 0;
}
- srv = type_to_srv[ureq->type];
- if (!config.srv_enable[srv]) {
- log(L_INFO, "service %d is disabled, dropping", srv);
+ if (ureq->key_len > sizeof(ureq->reqbuf)) {
+ log(L_INFO, "bogus key_len %u - ignoring", ureq->key_len);
close_client(i);
return 0;
}
-
if (cinfo[i].bytecnt < USER_HDR_SIZE + ureq->key_len) {
- log(L_INFO, "read %d, need %d more to read",
+ log(L_INFO, "read %d, need to read %d",
cinfo[i].bytecnt, USER_HDR_SIZE + ureq->key_len);
return 0; /* more to read */
}
if (cinfo[i].bytecnt > USER_HDR_SIZE + ureq->key_len) {
- log(L_INFO, "read overflow");
+ log(L_INFO, "read overflow: %u > %u",
+ (int)cinfo[i].bytecnt, (int)(USER_HDR_SIZE + ureq->key_len));
close_client(i);
return 0;
}
- if (ureq->version != NSCD_VERSION) {
- log(L_INFO, "wrong version");
+ if (unsupported_ureq_type(ureq->type)) {
+ /* We don't know this request. Just close the connection.
+ * (glibc client interprets this like "not supported by this nscd")
+ * Happens very often, thus DEBUG, not INFO */
+ log(L_DEBUG, "unsupported query, dropping");
close_client(i);
return 0;
}
+ srv = type_to_srv[ureq->type];
+ if (!config.srv_enable[srv]) {
+ log(L_INFO, "service %d is disabled, dropping", srv);
+ close_client(i);
+ return 0;
+ }
+
+ hex_dump(cinfo[i].ureq, cinfo[i].bytecnt);
+
+ if (ureq->type == SHUTDOWN
+ || ureq->type == INVALIDATE
+ ) {
+#ifdef SO_PEERCRED
+ struct ucred caller;
+ socklen_t optlen = sizeof(caller);
+ if (getsockopt(pfd[i].fd, SOL_SOCKET, SO_PEERCRED, &caller, &optlen) < 0) {
+ log(L_INFO, "ignoring special request - cannot get caller's id: %s", strerror(errno));
+ close_client(i);
+ return 0;
+ }
+ if (caller.uid != 0) {
+ log(L_INFO, "special request from non-root - ignoring");
+ close_client(i);
+ return 0;
+ }
+#endif
+ if (ureq->type == SHUTDOWN) {
+ log(L_INFO, "got shutdown request, exiting");
+ exit(0);
+ }
+ if (!ureq->key_len || ureq->reqbuf[ureq->key_len - 1]) {
+ log(L_INFO, "malformed invalidate request - ignoring");
+ close_client(i);
+ return 0;
+ }
+ log(L_INFO, "got invalidate request, flushing cache");
+ /* Frees entire cache. TODO: replace -1 with service (in ureq->reqbuf) */
+ age_cache(/*free_all:*/ 1, -1);
+ close_client(i);
+ return 0;
+ }
+
if (ureq->type != GETHOSTBYADDR
&& ureq->type != GETHOSTBYADDRv6
) {
check_files(srv);
}
- /* If in cache, save ptr to response into cinfo and return */
- ureq_and_resp = find_cell_and_response(&cell, ureq);
+ cache_pp = lookup_in_cache(ureq);
+ ureq_and_resp = cache_pp ? *cache_pp : NULL;
+
if (ureq_and_resp) {
- response_header *resp = ureq_response(ureq_and_resp);
- unsigned sz = resp->version_or_size;
- log(L_DEBUG, "sz:%u", sz);
- hex_dump(resp, sz);
- ureq_and_resp->refcount = 1; /* cache shouldn't free it under us! */
- pfd[i].events = POLLOUT; /* we want to write out */
- cinfo[i].resptr = ureq_and_resp;
- cinfo[i].respos = 0;
- //cinfo[i].resp_sz = sz;
- return 1;
+ if (CACHED_ENTRY(ureq_and_resp)) {
+ /* Found. Save ptr to response into cinfo and return */
+ response_header *resp = ureq_response(ureq_and_resp);
+ unsigned sz = resp->version_or_size;
+
+ log(L_DEBUG, "sz:%u", sz);
+ hex_dump(resp, sz);
+ /* cache shouldn't free it under us! */
+ if (++ureq_and_resp->refcount == 0) {
+ error_and_die("BUG! ++%p.refcount rolled over to 0, exiting", ureq_and_resp);
+ }
+ log(L_DEBUG2, "++%p.refcount=%u", ureq_and_resp, ureq_and_resp->refcount);
+ pfd[i].events = POLLOUT; /* we want to write out */
+ cinfo[i].resptr = ureq_and_resp;
+ /*cinfo[i].respos = 0; - already is */
+ /* prevent future matches with anything */
+ cinfo[i].cache_pp = (void *) 1;
+ return 1; /* "ready to write data out to client" */
+ }
+
+ /* Not found. Remember a pointer where it will appear */
+ cinfo[i].cache_pp = cache_pp;
+
+ /* If it does not point to our own ureq buffer... */
+ if (CACHE_PTR(ureq_and_resp) != ureq) {
+ /* We are not the first client who wants this */
+ log(L_DEBUG, "another request is in progress (%p), waiting for its result", ureq_and_resp);
+ MARK_PTR_SHARED(cache_pp); /* "please inform us when it's ready" */
+ /* "we do not wait for client anymore" */
+ cinfo[i].client_fd = pfd[i].fd;
+ /* Don't wait on fd. Worker response will unblock us */
+ pfd[i].events = 0;
+ return 0;
+ }
+ /* else: lookup_in_cache inserted (ureq & 1) into *cache_pp:
+ * we are the first client to miss on this ureq. */
}
/* Start worker thread */
- cinfo[i].cell = cell;
+ log(L_DEBUG, "stored %p in cache, starting a worker", ureq_and_resp);
/* Now we will wait on worker's fd, not client's! */
cinfo[i].client_fd = pfd[i].fd;
pfd[i].fd = create_and_feed_worker(ureq);
-
-/* We can do it here, but we don't really need to.
- * We need to have client_buf[] big enough anyway for worst case scenario,
- * so we can simply keep cbuf allocated until we close a client.
- cinfo[i].ureq = NULL;
- busy_cbuf[cinfo[i].bufidx] = 0;
- */
return 0;
}
-/* When we return, reply is fully read and stored in cache,
+static void prepare_for_writeout(unsigned i, user_req *cached)
+{
+ log(L_DEBUG2, "client %u: data is ready at %p", i, cached);
+
+ if (cinfo[i].client_fd) {
+ pfd[i].fd = cinfo[i].client_fd;
+ cinfo[i].client_fd = 0; /* "we don't wait for worker reply" */
+ }
+ pfd[i].events = POLLOUT;
+
+ /* Writeout position etc */
+ cinfo[i].resptr = cached;
+ /*cinfo[i].respos = 0; - already is */
+ /* if worker took some time to get info (e.g. DNS query),
+ * prevent client timeout from triggering at once */
+ cinfo[i].started_ms = g_now_ms;
+}
+
+/* Worker seems to be ready to write the response.
+ * When we return, response is fully read and stored in cache,
* worker's fd is closed, pfd[i] and cinfo[i] are updated. */
-static void handle_worker_response(int i, unsigned now_ms)
+static void handle_worker_response(int i)
{
- response_header sz_and_found;
+ struct { /* struct response_header + small body */
+ uint32_t version_or_size;
+ int32_t found;
+ char body[256 - 8];
+ } sz_and_found;
user_req *cached;
- user_req *ureq = cinfo[i].ureq;
+ user_req *ureq;
response_header *resp;
- unsigned resp_sz;
- unsigned ureq_sz_aligned = (char*)ureq_response(ureq) - (char*)ureq;
-
- /* Replies can be big (getgrnam("guest") on a big user db),
- * we cannot rely on them being atomic. However, we know that worker
- * _always_ gives reply in one full_write(), so loop and read it all
- * (looping is implemented inside full_read()) */
- resp_sz = full_read(pfd[i].fd, &sz_and_found, 8);
- if (resp_sz != 8) {
+ unsigned sz, resp_sz;
+ unsigned ureq_sz_aligned;
+
+ cached = NULL;
+ ureq = cinfo[i].ureq;
+ ureq_sz_aligned = (char*)ureq_response(ureq) - (char*)ureq;
+
+ sz = full_read(pfd[i].fd, &sz_and_found, sizeof(sz_and_found));
+ if (sz < 8) {
/* worker was killed? */
- log(L_DEBUG, "worker gave short reply:%u != 8", resp_sz);
+ log(L_DEBUG, "worker gave short reply:%u < 8", sz);
goto err;
}
resp_sz = sz_and_found.version_or_size;
- if (resp_sz < 8 || resp_sz > 0xfffffff) { /* 256 mb */
+ if (resp_sz < sz || resp_sz > 0x0fffffff) { /* 256 mb */
error("BUG: bad size from worker:%u", resp_sz);
goto err;
}
/* Create new block of cached info */
cached = xzalloc(ureq_sz_aligned + resp_sz);
- resp = (void*) ((char*)cached + ureq_sz_aligned);
+ log(L_DEBUG2, "xzalloc(%u):%p sz:%u resp_sz:%u found:%u",
+ ureq_sz_aligned + resp_sz, cached,
+ sz, resp_sz,
+ (int)sz_and_found.found
+ );
+ resp = (void*) (((char*) cached) + ureq_sz_aligned);
memcpy(cached, ureq, ureq_size(ureq));
- resp->version_or_size = resp_sz;
- resp->found = sz_and_found.found;
- if (sz_and_found.found) {
+ memcpy(resp, &sz_and_found, sz);
+ if (sz_and_found.found && resp_sz > sz) {
/* We need to read data only if it's found
- * (otherwise worker sends only 8 bytes) */
- if (full_read(pfd[i].fd, resp->body, resp_sz - 8) != resp_sz - 8) {
+ * (otherwise worker sends only 8 bytes).
+ *
+ * Replies can be big (getgrnam("guest") on a big user db),
+ * we cannot rely on them being atomic. However, we know
+ * that worker _always_ gives reply in one full_write(),
+ * so we loop and read it all
+ * (looping is implemented inside full_read())
+ */
+ if (full_read(pfd[i].fd, ((char*) resp) + sz, resp_sz - sz) != resp_sz - sz) {
/* worker was killed? */
- log(L_DEBUG, "worker gave short reply");
- free(cached);
+ log(L_DEBUG, "worker gave short reply, free(%p)", cached);
err:
+ free(cached);
cached = NULL;
goto wo;
}
}
+ set_cache_timestamp(cached);
hex_dump(resp, resp_sz);
- /* save in cache */
- cached->refcount = 1; /* cache shouldn't free it under us! */
- set_cache_timestamp(cached, now_ms);
- save_in_cell(cinfo[i].cell, cached, now_ms);
wo:
close(pfd[i].fd);
- /* schedule for writeout */
- pfd[i].fd = cinfo[i].client_fd;
- cinfo[i].client_fd = 0; /* no, we don't wait for worker reply anymore */
- pfd[i].events = POLLOUT;
- /* pfd[i].revents = 0; - not needed? */
+ /* Save in cache */
+ unsigned ref = 0;
+ user_req **cache_pp = cinfo[i].cache_pp;
+ if (cache_pp != NULL) { /* if not a fake entry */
+ ureq = *cache_pp;
+ *cache_pp = cached;
+ if (CACHE_SHARED(ureq)) {
+ /* Other clients wait for this response too,
+ * wake them (and us) up and set refcount = no_of_clients */
+ unsigned j;
+
+ for (j = 2; j < num_clients; j++) {
+ if (cinfo[j].cache_pp == cache_pp) {
+ /* This client uses the same cache entry */
+ ref++;
+ /* prevent future matches with anything */
+ cinfo[j].cache_pp = (void *) 1;
+ prepare_for_writeout(j, cached);
+ }
+ }
+ goto ret;
+ }
+ /* prevent future matches with anything */
+ cinfo[i].cache_pp = (void *) 1;
+ ref = 1;
+ }
- /* writeout position etc */
- cinfo[i].resptr = cached;
- cinfo[i].respos = 0;
- //cinfo[i].resp_sz = resp_sz;
- /* if worker took some time to get info (e.g. DNS query),
- * prevent client timeout from triggering at once */
- cinfo[i].started_ms = now_ms;
+ prepare_for_writeout(i, cached);
+ret:
+ /* cache shouldn't free it under us! */
+ if (cached) {
+ cached->refcount = ref;
+ log(L_DEBUG2, "%p.refcount=%u", cached, ref);
+ }
+ aging_interval_ms = min_aging_interval_ms;
}
static void main_loop(void)
while (1) {
int i, j;
int r;
- unsigned now_ms;
r = SMALL_POLL_TIMEOUT_MS;
if (num_clients <= 2 && !cached_cnt)
r = -1; /* infinite */
else if (num_clients < max_reqnum)
r = aging_interval_ms;
-
#if 0 /* Debug: leak detector */
{
static unsigned long long cnt;
- void *p = malloc(240); /* should not be too small */
+ static unsigned long low_malloc = -1L;
+ static unsigned long low_sbrk = -1L;
+ void *p = malloc(540); /* should not be too small */
void *s = sbrk(0);
free(p);
- log(L_INFO, "entering poll %llu (%d ms). num_clients:%u cached:%u %u/%u next malloc:%p, sbrk:%p",
- cnt, r, num_clients, cached_cnt, cache_hit_cnt, cache_access_cnt, p, s);
+ if ((unsigned long)p < low_malloc)
+ low_malloc = (unsigned long)p;
+ if ((unsigned long)s < low_sbrk)
+ low_sbrk = (unsigned long)s;
+ log(L_INFO, "poll %llu (%d ms). clients:%u cached:%u %u/%u malloc:%p (%lu), sbrk:%p (%lu)",
+ cnt, r, num_clients, cached_cnt, cache_hit_cnt, cache_access_cnt,
+ p, (unsigned long)p - low_malloc,
+ s, (unsigned long)s - low_sbrk);
cnt++;
}
#else
- log(L_DEBUG, "entering poll (%d ms). num_clients:%u cached:%u hit_ratio:%u/%u",
+ log(L_DEBUG, "poll %d ms. clients:%u cached:%u hit ratio:%u/%u",
r, num_clients, cached_cnt, cache_hit_cnt, cache_access_cnt);
#endif
* There is no blocking I/O (except when we talk to worker thread
* which is guaranteed to not block us for long) */
- now_ms = monotonic_ms();
+ g_now_ms = monotonic_ms();
if (r == 0)
goto skip_fd_checks;
ndelay_on(cfd);
close_on_exec(cfd);
/* x[num_clients] is next free element, taking it */
+ log(L_DEBUG2, "new client %d, fd %d", num_clients, cfd);
pfd[num_clients].fd = cfd;
pfd[num_clients].events = POLLIN;
/* this will make us do read() in next for() loop: */
pfd[num_clients].revents = POLLIN;
memset(&cinfo[num_clients], 0, sizeof(cinfo[num_clients]));
/* cinfo[num_clients].bytecnt = 0; - done */
- cinfo[num_clients].started_ms = now_ms;
+ cinfo[num_clients].started_ms = g_now_ms;
cinfo[num_clients].bufidx = alloc_buf_no();
cinfo[num_clients].ureq = bufno2buf(cinfo[num_clients].bufidx);
num_clients++;
resp = ureq_response(cinfo[i].resptr);
resp_sz = resp->version_or_size;
resp->version_or_size = NSCD_VERSION;
- r = safe_write(pfd[i].fd, resp + cinfo[i].respos, resp_sz - cinfo[i].respos);
+ errno = 0;
+ r = safe_write(pfd[i].fd, ((char*) resp) + cinfo[i].respos, resp_sz - cinfo[i].respos);
resp->version_or_size = resp_sz;
- if (r < 0 && errno == EAGAIN)
+ if (r < 0 && errno == EAGAIN) {
+ log(L_DEBUG, "client %u: EAGAIN on write", i);
continue;
+ }
if (r <= 0) { /* client isn't there anymore */
- write_out_is_done:
- free_refcounted_ureq(&cinfo[i].resptr);
+ log(L_DEBUG, "client %u is gone (write returned:%d err:%s)",
+ i, r, errno ? strerror(errno) : "-");
close_client(i);
continue;
}
* read(3, "www.google.com\0\0", 16) = 16
* close(3) = 0
*/
- goto write_out_is_done;
+ log(L_DEBUG, "client %u: sent answer %u/%u/%u bytes", i, r, cinfo[i].respos, resp_sz);
+ close_client(i);
+ continue;
}
+ log(L_DEBUG, "client %u: sent partial answer %u/%u/%u bytes", i, r, cinfo[i].respos, resp_sz);
+ continue;
}
/* "Read reply from worker" case. Worker may be
- * already dead, revents may contain other bits too */
+ * already dead, revents may contain other bits too
+ */
if ((pfd[i].revents & POLLIN) && cinfo[i].client_fd) {
log(L_DEBUG, "reading response for client %u", i);
- handle_worker_response(i, now_ms);
+ handle_worker_response(i);
/* We can immediately try to write a response
* to client */
goto write_out;
}
+ /* POLLHUP means pfd[i].fd is closed by peer.
+	 * POLLHUP+POLLOUT[+POLLERR] is seen when we are writing out
+ * and see that pfd[i].fd is closed by peer (for example,
+ * it happens when client's result buffer is too small
+ * to receive a huge GETGRBYNAME result).
+ */
+ if ((pfd[i].revents & ~(POLLOUT+POLLERR)) == POLLHUP) {
+ int is_client = (cinfo[i].client_fd == 0 || cinfo[i].client_fd == pfd[i].fd);
+ log(L_INFO, "%s %u disappeared (got POLLHUP on fd %d)",
+ is_client ? "client" : "worker",
+ i,
+ pfd[i].fd
+ );
+ if (is_client)
+ close_client(i);
+ else {
+			/* Read worker output anyway; the error handling
+			 * in that function deals with short reads.
+ * Simply closing client is wrong: it leaks
+ * shared future entries. */
+ handle_worker_response(i);
+ }
+ continue;
+ }
+
/* All strange and unexpected cases */
if (pfd[i].revents != POLLIN) {
- /* Not just "can read" - prolly POLLHUP too */
- log(L_INFO, "client %u revents is strange:%x", i, pfd[i].revents);
+ /* Not just "can read", but some other bits are there */
+ log(L_INFO, "client %u revents is strange:0x%x", i, pfd[i].revents);
close_client(i);
continue;
}
}
cinfo[i].bytecnt += r;
if (cinfo[i].bytecnt >= sizeof(user_req_header)) {
- if (cinfo[i].ureq->type == SHUTDOWN
- || cinfo[i].ureq->type == INVALIDATE
- ) {
- const char *service;
- unsigned len;
-#ifdef SO_PEERCRED
- struct ucred caller;
- socklen_t optlen = sizeof(caller);
- if (getsockopt(pfd[i].fd, SOL_SOCKET, SO_PEERCRED, &caller, &optlen) < 0) {
- log(L_INFO, "ignoring special request - cannot get caller's id: %s", strerror(errno));
- close_client(i);
- continue;
- }
- if (caller.uid != 0) {
- log(L_INFO, "special request from non-root - ignoring");
- close_client(i);
- continue;
- }
-#endif
- if (cinfo[i].ureq->type == SHUTDOWN) {
- log(L_INFO, "got shutdown request, exiting");
- return; /* exits nscd */;
- }
- len = cinfo[i].ureq->key_len;
- service = (char*)&cinfo[i].ureq + len;
- if (sizeof(user_req_header) + len != cinfo[i].bytecnt
- || !len
- || service[len-1] != '\0'
- ) {
- log(L_INFO, "malformed invalidate request - ignoring");
- close_client(i);
- continue;
- }
- log(L_INFO, "got invalidate request, flushing cache");
- age_cache(0, -1); /* frees entire cache. TODO: replace -1 with service */
- close_client(i);
- continue;
- }
if (handle_client(i)) {
/* Response is found in cache! */
goto write_out;
skip_fd_checks:
/* Age cache */
- if ((now_ms - last_age_time) >= aging_interval_ms) {
- last_age_time = now_ms;
- age_cache(now_ms, -1);
+ if ((g_now_ms - last_age_time) >= aging_interval_ms) {
+ last_age_time = g_now_ms;
+ age_cache(/*free_all:*/ 0, -1);
}
/* Close timed out client connections */
for (i = 2; i < num_clients; i++) {
- if (pfd[i].fd && !cinfo[i].client_fd
- && (now_ms - cinfo[i].started_ms) > CLIENT_TIMEOUT_MS
+ if (pfd[i].fd != 0 /* not closed yet? */
+ && cinfo[i].client_fd == 0 /* do we still wait for client, not worker? */
+ && (g_now_ms - cinfo[i].started_ms) > CLIENT_TIMEOUT_MS
) {
- log(L_INFO, "timed out waiting for client %u, dropping", i);
+ log(L_INFO, "timed out waiting for client %u (%u ms), dropping",
+ i, (unsigned)(g_now_ms - cinfo[i].started_ms));
close_client(i);
}
}
coalesce_done:
num_clients -= cnt_closed;
- log(L_DEBUG, "removing %d closed clients. num_clients:%d", cnt_closed, num_clients);
+ log(L_DEBUG, "removing %d closed clients. clients:%d", cnt_closed, num_clients);
min_closed = INT_MAX;
cnt_closed = 0;
/* start accepting new connects */
};
static const char *const help[] = {
- "Do not daemonize; log to stderr",
+ "Do not daemonize; log to stderr (-dd: more verbosity)",
"File to read configuration from",
"Invalidate cache",
"Shut the server down",
static void handle_debuglvl(const char *str, int srv)
{
- debug |= getnum(str);
+ debug |= (uint8_t) getnum(str);
}
static void handle_threads(const char *str, int srv)
word = conf_words;
while (1) {
if (strcmp(word->str + 1, p) == 0) {
- int srv;
+ int srv = 0;
p = skip_whitespace(p + strlen(p) + 1);
*skip_non_whitespace(p) = '\0';
if (word->str[0] == 'S') {
/* not static - don't inline me, compiler! */
+void readlink_self_exe(void);
void readlink_self_exe(void)
{
char buf[PATH_MAX + 1];
if (!arg) { /* shutdown */
xfull_write(sock, &ureq, sizeof(ureq));
- dup2(2, 1);
- error_and_die("sent shutdown request, exiting");
+ printf("sent shutdown request, exiting\n");
} else { /* invalidate */
size_t arg_len = strlen(arg) + 1;
struct {
reqdata.req.key_len = arg_len;
memcpy(reqdata.arg, arg, arg_len);
xfull_write(sock, &reqdata, arg_len + sizeof(ureq));
- dup2(2, 1);
- error_and_die("sent invalidate(%s) request, exiting", arg);
+ printf("sent invalidate(%s) request, exiting\n", arg);
}
+ exit(0);
}
+/* Callback for glibc-2.15 */
+struct traced_file;
+static void do_nothing(size_t dbidx, struct traced_file *finfo)
+{
+ /* nscd from glibc-2.15 does something like this:
+ if (!dbs[dbidx].enabled || !dbs[dbidx].check_file)
+ return;
+ add_file_to_watch_list(finfo->fname);
+ */
+}
+
/* This internal glibc function is called to disable trying to contact nscd.
- * We _are_ nscd, so we need to do the lookups, and not recurse. */
-void __nss_disable_nscd(void);
+ * We _are_ nscd, so we need to do the lookups, and not recurse.
+ * Until 2.14, this function was taking no parameters.
+ * In 2.15, it takes a function pointer from hell.
+ */
+void __nss_disable_nscd(void (*hell)(size_t, struct traced_file*));
+
int main(int argc, char **argv)
{
int n;
+ unsigned opt_d_cnt;
const char *env_U;
const char *conffile;
/* make sure we don't get recursive calls */
- __nss_disable_nscd();
+ __nss_disable_nscd(do_nothing);
if (argv[0][0] == 'w') /* "worker_nscd" */
- worker();
+ worker(argv[1]);
setlinebuf(stdout);
setlinebuf(stderr);
+ /* Make sure stdio is not closed */
+ n = xopen3("/dev/null", O_RDWR, 0);
+ while (n < 2)
+ n = dup(n);
+ /* Close unexpected open file descriptors */
+ n |= 0xff; /* start from at least fd# 255 */
+ do {
+ close(n--);
+ } while (n > 2);
+
/* For idiotic kernels which disallow "exec /proc/self/exe" */
readlink_self_exe();
conffile = default_conffile;
+ opt_d_cnt = 0;
while ((n = getopt_long(argc, argv, "df:i:KVgt:", longopt, NULL)) != -1) {
switch (n) {
case 'd':
+ opt_d_cnt++;
debug &= ~D_DAEMON;
break;
case 'f':
print_help_and_die();
}
}
+ /* Multiple -d can bump debug regardless of nscd.conf:
+ * no -d or -d: 0, -dd: 1,
+ * -ddd: 3, -dddd: 7, -ddddd: 15
+ */
+ if (opt_d_cnt != 0)
+ debug |= (((1U << opt_d_cnt) >> 1) - 1) & L_ALL;
env_U = getenv("U");
/* Avoid duplicate warnings if $U exists */
if (putenv(user_to_env_U(config.user)))
error_and_die("out of memory");
/* fds leaked by nss will be closed by execed copy */
- execv("/proc/self/exe", argv);
- xexecve(self_exe_points_to, argv, environ);
+ execv(self_exe_points_to, argv);
+ xexecve("/proc/self/exe", argv, environ);
}
/* Allocate dynamically sized stuff */
max_reqnum += 2; /* account for 2 first "fake" clients */
if (max_reqnum < 8) max_reqnum = 8; /* sanitize */
- if (max_reqnum > 0xffff) max_reqnum = 0xffff;
- log(L_DEBUG, "will handle %u requests in parallel", max_reqnum - 2);
+ /* Since refcount is a byte, can't serve more than 255-2 clients
+ * at once. The rest will block in connect() */
+ if (max_reqnum > 0xff) max_reqnum = 0xff;
client_buf = xzalloc(max_reqnum * sizeof(client_buf[0]));
busy_cbuf = xzalloc(max_reqnum * sizeof(busy_cbuf[0]));
pfd = xzalloc(max_reqnum * sizeof(pfd[0]));
cinfo = xzalloc(max_reqnum * sizeof(cinfo[0]));
cache_size = (config.size[0] + config.size[1] + config.size[2]) / 8;
- if (cache_size < 64) cache_size = 64; /* 8*64 = 512 entries min */
+ if (cache_size < 8) cache_size = 8; /* 8*8 = 64 entries min */
if (cache_size > 0xffff) cache_size = 0xffff; /* 8*64k entries max */
cache_size |= 1; /* force it to be odd */
- log(L_DEBUG, "cache size %u x 8 entries", cache_size);
cache = xzalloc(cache_size * sizeof(cache[0]));
- /* Make sure stdio is not closed */
- n = xopen3("/dev/null", O_RDWR, 0);
- while (n < 2)
- n = dup(n);
- /* Close unexpected open file descriptors */
- n |= 0xff; /* start from at least fd# 255 */
- do {
- close(n--);
- } while (n > 2);
-
/* Register cleanup hooks */
signal(SIGINT, cleanup_on_signal);
signal(SIGTERM, cleanup_on_signal);
signal(SIGALRM, SIG_DFL);
#endif
- mkdir(NSCD_DIR, 0777);
+ if (mkdir(NSCD_DIR, 0755) == 0) {
+ /* prevent bad mode of NSCD_DIR if umask is e.g. 077 */
+ chmod(NSCD_DIR, 0755);
+ }
pfd[0].fd = open_socket(NSCD_SOCKET);
pfd[1].fd = open_socket(NSCD_SOCKET_OLD);
pfd[0].events = POLLIN;
pfd[1].events = POLLIN;
-
+
if (debug & D_DAEMON) {
daemon(/*nochdir*/ 1, /*noclose*/ 0);
if (config.logfile) {
signal(SIGTSTP, SIG_IGN);
}
+ log(L_ALL, "unscd v" PROGRAM_VERSION ", debug level 0x%x", debug & L_ALL);
+ log(L_DEBUG, "max %u requests in parallel", max_reqnum - 2);
+ log(L_DEBUG, "cache size %u x 8 entries", cache_size);
+
if (env_U) {
int size;
gid_t *ug = env_U_to_uid_and_gids(env_U, &size);
- if (setgroups(size, &ug[1]))
- perror_and_die("cannot set groups for user '%s'", config.user);
- if (setuid(ug[0]))
- perror_and_die("cannot set uid to %u", (unsigned)(ug[0]));
+ if (size > 1)
+ if (setgroups(size - 1, &ug[1]) || setgid(ug[1]))
+ perror_and_die("cannot set groups for user '%s'", config.user);
+ if (size > 0)
+ if (setuid(ug[0]))
+ perror_and_die("cannot set uid to %u", (unsigned)(ug[0]));
free(ug);
}
- log(L_ALL, "nscd v" PROGRAM_VERSION ", debug level %x", debug & L_ALL);
- log(L_DEBUG, "passwd cache: %d pttl %u nttl %u",
- config.srv_enable[SRV_PASSWD],
- config.pttl[SRV_PASSWD],
- config.nttl[SRV_PASSWD]);
- log(L_DEBUG, " group cache: %d pttl %u nttl %u",
- config.srv_enable[SRV_GROUP ],
- config.pttl[SRV_GROUP],
- config.nttl[SRV_GROUP]);
- log(L_DEBUG, " hosts cache: %d pttl %u nttl %u",
- config.srv_enable[SRV_HOSTS ],
- config.pttl[SRV_HOSTS],
- config.nttl[SRV_HOSTS]);
-
for (n = 0; n < 3; n++) {
+ log(L_DEBUG, "%s cache enabled:%u pttl:%u nttl:%u",
+ srv_name[n],
+ config.srv_enable[n],
+ config.pttl[n],
+ config.nttl[n]);
config.pttl[n] *= 1000;
config.nttl[n] *= 1000;
}