]> git.donarmstrong.com Git - unscd.git/blobdiff - nscd.c
* update compatibility level to 5
[unscd.git] / nscd.c
diff --git a/nscd.c b/nscd.c
index 3037240a811c84fe52248d46679111295af4c6dc..29494ff89771f365b8c6dd76fe5971ab44c4cf50 100644 (file)
--- a/nscd.c
+++ b/nscd.c
@@ -17,7 +17,7 @@
 /*
 Build instructions:
 
 /*
 Build instructions:
 
-gcc -Os -o nscd nscd.c
+gcc -Wall -Wunused-parameter -Os -o nscd nscd.c
 
 gcc -fomit-frame-pointer -Wl,--sort-section -Wl,alignment -Wl,--sort-common
       -Os -o nscd nscd.c
 
 gcc -fomit-frame-pointer -Wl,--sort-section -Wl,alignment -Wl,--sort-common
       -Os -o nscd nscd.c
@@ -29,7 +29,7 @@ quite a bit of bugs in it. This leads people to invent babysitters
 which restart crashed/hung nscd. This is ugly.
 
 After looking at nscd source in glibc I arrived to the conclusion
 which restart crashed/hung nscd. This is ugly.
 
 After looking at nscd source in glibc I arrived to the conclusion
-that its desidn is contributing to this significantly. Even if nscd's
+that its design is contributing to this significantly. Even if nscd's
 code is 100.00% perfect and bug-free, it can still suffer from bugs
 in libraries it calls.
 
 code is 100.00% perfect and bug-free, it can still suffer from bugs
 in libraries it calls.
 
@@ -42,7 +42,7 @@ or file descriptor leaks and other bugs.
 
 Since nscd is multithreaded program with single shared cache,
 any resource leak in any NSS library has cumulative effect.
 
 Since nscd is multithreaded program with single shared cache,
 any resource leak in any NSS library has cumulative effect.
-Even if an NSS library leaks a file descriptor 0.01% of the time,
+Even if a NSS library leaks a file descriptor 0.01% of the time,
 this will make nscd crash or hang after some time.
 
 Of course bugs in NSS .so modules should be fixed, but meanwhile
 this will make nscd crash or hang after some time.
 
 Of course bugs in NSS .so modules should be fixed, but meanwhile
@@ -120,8 +120,27 @@ vda.linux@googlemail.com
  * 0.36 excercise extreme paranoia handling server-user option;
  *      a little bit more verbose logging:
  *      L_DEBUG2 log level added, use debug-level 7 to get it
  * 0.36 excercise extreme paranoia handling server-user option;
  *      a little bit more verbose logging:
  *      L_DEBUG2 log level added, use debug-level 7 to get it
+ * 0.37 users reported over-zealous "detected change in /etc/passwd",
+ *      apparently stat() returns random garbage in unused padding
+ *      on some systems. Made the check less paranoid.
+ * 0.38 log POLLHUP better
+ * 0.39 log answers to client better, log getpwnam in the worker,
+ *      pass debug level value down to worker.
+ * 0.40   fix handling of shutdown and invalidate requests;
+ *        fix bug with answer written in several pieces
+ * 0.40.1 set hints.ai_socktype = SOCK_STREAM in GETAI request
+ * 0.41   eliminate double caching of two near-simultaneous identical requests -
+ *        EXPERIMENTAL
+ * 0.42   execute /proc/self/exe by link name first (better comm field)
+ * 0.43   fix off-by-one error in setgroups
+ * 0.44   make -d[ddd] bump up debug - easier to explain to users
+ *        how to produce detailed log (no nscd.conf tweaking)
+ * 0.45   Fix out-of-bounds array access and log/pid file permissions -
+ *        thanks to Sebastian Krahmer (krahmer AT suse.de)
+ * 0.46   fix a case when we forgot to remove a future entry on worker failure
+ * 0.47   fix nscd without -d to not bump debug level
  */
  */
-#define PROGRAM_VERSION "0.36"
+#define PROGRAM_VERSION "0.47"
 
 #define DEBUG_BUILD 1
 
 
 #define DEBUG_BUILD 1
 
@@ -130,6 +149,8 @@ vda.linux@googlemail.com
 ** Generic helpers
 */
 
 ** Generic helpers
 */
 
+#define ARRAY_SIZE(x) ((unsigned)(sizeof(x) / sizeof((x)[0])))
+
 #define NORETURN __attribute__ ((__noreturn__))
 
 
 #define NORETURN __attribute__ ((__noreturn__))
 
 
@@ -473,6 +494,12 @@ enum {
        SRV_HOSTS,
 };
 
        SRV_HOSTS,
 };
 
+static const char srv_name[3][7] = {
+       "passwd",
+       "group",
+       "hosts"
+};
+
 static struct {
        const char *logfile;
        const char *user;
 static struct {
        const char *logfile;
        const char *user;
@@ -503,9 +530,9 @@ static const char *self_exe_points_to = "/proc/self/exe";
 
 /* Header common to all requests */
 #define USER_REQ_STRUCT \
 
 /* Header common to all requests */
 #define USER_REQ_STRUCT \
-       int32_t version; /* Version number of the daemon interface */ \
-       int32_t type;    /* Service requested */ \
-       int32_t key_len; /* Key length */
+       uint32_t version; /* Version number of the daemon interface */ \
+       uint32_t type;    /* Service requested */ \
+       uint32_t key_len; /* Key length */
 
 typedef struct user_req_header {
        USER_REQ_STRUCT
 
 typedef struct user_req_header {
        USER_REQ_STRUCT
@@ -528,7 +555,7 @@ typedef struct user_req {
                        USER_REQ_STRUCT
                };
                struct { /* when stored in cache, overlaps .version */
                        USER_REQ_STRUCT
                };
                struct { /* when stored in cache, overlaps .version */
-                       unsigned refcount:8; /* actually, can be 1 or 0 only */
+                       unsigned refcount:8;
                        /* (timestamp24 * 256) == timestamp in ms */
                        unsigned timestamp24:24;
                };
                        /* (timestamp24 * 256) == timestamp in ms */
                        unsigned timestamp24:24;
                };
@@ -607,16 +634,12 @@ static int unsupported_ureq_type(unsigned type)
 {
        if (type == GETAI) return 0;
        if (type == INITGROUPS) return 0;
 {
        if (type == GETAI) return 0;
        if (type == INITGROUPS) return 0;
-       if (type > GETHOSTBYADDRv6) return 1;
+       if (type == GETSTAT) return 1;
+       if (type > INVALIDATE) return 1;
        return 0;
 }
 
        return 0;
 }
 
-/* Possible reductions:
- * fd, bufidx - uint8_t
- * started_ms -> uint16_t started_s
- * ureq - eliminate (derivable from bufidx?)
- * cell - eliminate (derivable from resptr?)
- */
+
 typedef struct client_info {
        /* if client_fd != 0, we are waiting for the reply from worker
         * on pfd[i].fd, and client_fd is saved client's fd
 typedef struct client_info {
        /* if client_fd != 0, we are waiting for the reply from worker
         * on pfd[i].fd, and client_fd is saved client's fd
@@ -626,12 +649,12 @@ typedef struct client_info {
        unsigned bufidx;        /* buffer# in global client_buf[] */
        unsigned started_ms;
        unsigned respos;        /* response */
        unsigned bufidx;        /* buffer# in global client_buf[] */
        unsigned started_ms;
        unsigned respos;        /* response */
-       //unsigned resp_sz;
        user_req *resptr;       /* response */
        user_req *resptr;       /* response */
+       user_req **cache_pp;    /* cache entry address */
        user_req *ureq;         /* request (points to client_buf[x]) */
        user_req *ureq;         /* request (points to client_buf[x]) */
-       user_req **cell;        /* cache cell ptr */
 } client_info;
 
 } client_info;
 
+static unsigned g_now_ms;
 static int min_closed = INT_MAX;
 static int cnt_closed = 0;
 static int num_clients = 2; /* two listening sockets are "clients" too */
 static int min_closed = INT_MAX;
 static int cnt_closed = 0;
 static int num_clients = 2; /* two listening sockets are "clients" too */
@@ -639,25 +662,86 @@ static int num_clients = 2; /* two listening sockets are "clients" too */
 /* We read up to max_reqnum requests in parallel */
 static unsigned max_reqnum = 14;
 static int next_buf;
 /* We read up to max_reqnum requests in parallel */
 static unsigned max_reqnum = 14;
 static int next_buf;
-/* Each of these points to [max_reqnum] sized array */
 static char          (*client_buf)[MAX_USER_REQ_SIZE];
 static char          *busy_cbuf;
 static struct pollfd *pfd;
 static client_info   *cinfo;
 
 static char          (*client_buf)[MAX_USER_REQ_SIZE];
 static char          *busy_cbuf;
 static struct pollfd *pfd;
 static client_info   *cinfo;
 
+/* Request, response and cache data structures:
+ *
+ * cache[] (defined later):
+ * cacheline_t cache[cache_size] array, or in other words,
+ * user_req* cache[cache_size][8] array.
+ * Every client request is hashed, hash value determines which cache[x]
+ * will have the response stored in one of its 8 elements.
+ * Cache entries have this format: request, then padding to 32 bits,
+ * then the response.
+ * Addresses in cache[x][y] may be NULL or:
+ * (&client_buf[z]) & 1: the cache miss is in progress ("future entry"):
+ * "the data is not in the cache (yet), wait for it to appear"
+ * (&client_buf[z]) & 3: the cache miss is in progress and other clients
+ * also want the same data ("shared future entry")
+ * else (non-NULL but low two bits are 0): cached data in malloc'ed block
+ *
+ * Each of these is a [max_reqnum] sized array:
+ * pfd[i] - given to poll() to wait for requests and replies.
+ *      .fd: first two pfd[i]: listening Unix domain sockets, else
+ *      .fd: open fd to a client, for reading client's request, or
+ *      .fd: open fd to a worker, to send request and get response back
+ * cinfo[i] - auxiliary client data for pfd[i]
+ *      .client_fd: open fd to a client, in case we already had read its
+ *          request and got a cache miss, and created a worker or
+ *          wait for another client's worker.
+ *          Otherwise, it's 0 and client's fd is in pfd[i].fd
+ *      .bufidx: index in client_buf[] we store client's request in
+ *      .bytecnt: size of the request
+ *      .started_ms: used to time out unresponsive clients
+ *      .respos:
+ *      .resptr:
+ *      .cache_pp: &cache[x][y] where the response is, or will be stored.
+ *      .ureq:
+ * When a client has received its reply (or otherwise closed (timeout etc)),
+ * corresponding pfd[i] and cinfo[i] are removed by shifting [i+1], [i+2] etc
+ * elements down, so that both arrays never have free holes.
+ * [num_clients] is always the first free element.
+ *
+ * Each of these also is a [max_reqnum] sized array, but indexes
+ * do not correspond directly to pfd[i] and cinfo[i]:
+ * client_buf[n][MAX_USER_REQ_SIZE] - buffers we read client requests into
+ * busy_cbuf[n] - bool flags marking busy client_buf[]
+ */
+/* Possible reductions:
+ * fd, bufidx - uint8_t
+ * started_ms -> uint16_t started_s
+ * ureq - eliminate (derivable from bufidx?)
+ */
+
+/* Are special bits 0? is it a true cached entry? */
+#define CACHED_ENTRY(p)     ( ((long)(p) & 3) == 0 )
+/* Are special bits 11? is it a shared future cache entry? */
+#define CACHE_SHARED(p)     ( ((long)(p) & 3) == 3 )
+/* Return a ptr with special bits cleared (used for accessing data) */
+#define CACHE_PTR(p)        ( (void*) ((long)(p) & ~(long)3) )
+/* Return a ptr with special bits set to x1: make future cache entry ptr */
+#define MAKE_FUTURE_PTR(p)  ( (void*) ((long)(p) | 1) )
+/* Modify ptr, set special bits to 11: shared future cache entry */
+#define MARK_PTR_SHARED(pp) ( *(long*)(pp) |= 3 )
+
 static inline unsigned ureq_size(const user_req *ureq)
 {
        return sizeof(user_req_header) + ureq->key_len;
 }
 
 static inline unsigned ureq_size(const user_req *ureq)
 {
        return sizeof(user_req_header) + ureq->key_len;
 }
 
-static unsigned cache_age(unsigned now_ms, const user_req *ureq)
+static unsigned cache_age(const user_req *ureq)
 {
 {
-       return (uint32_t)now_ms - (ureq->timestamp24 << 8);
+       if (!CACHED_ENTRY(ureq))
+               return 0;
+       return (uint32_t) (g_now_ms - (ureq->timestamp24 << 8));
 }
 
 }
 
-static void set_cache_timestamp(user_req *ureq, unsigned now_ms)
+static void set_cache_timestamp(user_req *ureq)
 {
 {
-       ureq->timestamp24 = now_ms >> 8;
+       ureq->timestamp24 = g_now_ms >> 8;
 }
 
 static int alloc_buf_no(void)
 }
 
 static int alloc_buf_no(void)
@@ -679,11 +763,16 @@ static inline void *bufno2buf(int i)
        return client_buf[i];
 }
 
        return client_buf[i];
 }
 
-static void close_client(int i)
+static void close_client(unsigned i)
 {
 {
-       log(L_DEBUG, "closing client %d (fd %d)", i, pfd[i].fd);
+       log(L_DEBUG, "closing client %u (fd %u,%u)", i, pfd[i].fd, cinfo[i].client_fd);
+       /* Paranoia. We had nasty bugs where client was closed twice. */
+       if (pfd[i].fd == 0) ////
+               return;
        close(pfd[i].fd);
        close(pfd[i].fd);
-       pfd[i].fd = 0; /* flag as unused */
+       if (cinfo[i].client_fd && cinfo[i].client_fd != pfd[i].fd)
+               close(cinfo[i].client_fd);
+       pfd[i].fd = 0; /* flag as unused (coalescing needs this) */
        busy_cbuf[cinfo[i].bufidx] = 0;
        cnt_closed++;
        if (i < min_closed)
        busy_cbuf[cinfo[i].bufidx] = 0;
        cnt_closed++;
        if (i < min_closed)
@@ -692,7 +781,7 @@ static void close_client(int i)
 
 
 /*
 
 
 /*
-** ncsd API <-> C API conversion
+** nscd API <-> C API conversion
 */
 
 typedef struct response_header {
 */
 
 typedef struct response_header {
@@ -995,7 +1084,9 @@ static ai_response_header *obtain_addrinfo(const char *hostname)
 
        memset(&hints, 0, sizeof(hints));
        hints.ai_flags = AI_CANONNAME;
 
        memset(&hints, 0, sizeof(hints));
        hints.ai_flags = AI_CANONNAME;
-       /* hinst.ai_socktype = SOCK_STREAM; - can kill dups (one for each possible SOCK_xxx) */
+       /* kills dups (one for each possible SOCK_xxx) */
+       /* this matches glibc behavior */
+       hints.ai_socktype = SOCK_STREAM;
        ai = NULL; /* on failure getaddrinfo may leave it as-is */
        err = getaddrinfo(hostname, NULL, &hints, &ai);
 
        ai = NULL; /* on failure getaddrinfo may leave it as-is */
        err = getaddrinfo(hostname, NULL, &hints, &ai);
 
@@ -1085,108 +1176,118 @@ static uint32_t bernstein_hash(void *p, unsigned sz, uint32_t hash)
        return hash;
 }
 
        return hash;
 }
 
-static user_req *find_cell_and_response(user_req ***cellp, user_req *ureq)
+static void free_refcounted_ureq(user_req **ureqp)
+{
+       user_req *ureq = *ureqp;
+
+       if (!CACHED_ENTRY(ureq))
+               return;
+
+       if (ureq->refcount) {
+               ureq->refcount--;
+       } else {
+               log(L_DEBUG2, "refcount == 0, free(%p)", ureq);
+               free(ureq);
+       }
+       *ureqp = NULL;
+}
+
+static user_req **lookup_in_cache(user_req *ureq)
 {
 {
-       user_req **cell;
+       user_req **cacheline;
+       int free_cache;
        unsigned hash;
        unsigned i;
        unsigned ureq_sz = ureq_size(ureq);
 
        /* prevent overflow and division by zero */
        unsigned hash;
        unsigned i;
        unsigned ureq_sz = ureq_size(ureq);
 
        /* prevent overflow and division by zero */
-       if ((int)(cache_access_cnt+1) < 0) {
+       cache_access_cnt++;
+       if ((int)cache_access_cnt < 0) {
                cache_access_cnt = (cache_access_cnt >> 1) + 1;
                cache_hit_cnt = (cache_hit_cnt >> 1) + 1;
        }
                cache_access_cnt = (cache_access_cnt >> 1) + 1;
                cache_hit_cnt = (cache_hit_cnt >> 1) + 1;
        }
-       cache_access_cnt++;
 
        hash = bernstein_hash(&ureq->key_len, ureq_sz - offsetof(user_req, key_len), ureq->type);
 
        hash = bernstein_hash(&ureq->key_len, ureq_sz - offsetof(user_req, key_len), ureq->type);
-       log(L_DEBUG, "hash:%08x", hash);
+       log(L_DEBUG2, "hash:%08x", hash);
        hash = hash % cache_size;
        hash = hash % cache_size;
-       (*cellp) = cell = cache[hash];
+       cacheline = cache[hash];
 
 
+       free_cache = -1;
        for (i = 0; i < 8; i++) {
        for (i = 0; i < 8; i++) {
-               if (!cell[i])
+               user_req *cached = CACHE_PTR(cacheline[i]);
+               if (!cached) {
+                       if (free_cache == -1)
+                               free_cache = i;
                        continue;
                        continue;
-// TODO: do secondary hash match
+               }
                /* ureq->version is always 2 and is reused in cache
                 * for other purposes, we need to skip it here */
                /* ureq->version is always 2 and is reused in cache
                 * for other purposes, we need to skip it here */
-               if (memcmp(&ureq->type, &cell[i]->type, ureq_sz - offsetof(user_req, type)) == 0) {
+               if (memcmp(&ureq->type, &cached->type, ureq_sz - offsetof(user_req, type)) == 0) {
                        log(L_DEBUG, "found in cache[%u][%u]", hash, i);
                        cache_hit_cnt++;
                        log(L_DEBUG, "found in cache[%u][%u]", hash, i);
                        cache_hit_cnt++;
-                       return cell[i];
+                       return &cacheline[i];
                }
        }
                }
        }
-       log(L_DEBUG, "not found in cache[%u][x]", hash);
-       return NULL;
-}
 
 
-static void free_refcounted_ureq(user_req **ureqp)
-{
-       user_req *ureq = *ureqp;
-       /* is it in use? */
-       if (ureq->refcount) {
-               ureq->refcount = 0; /* since it can be only 1 or 0... */
-       } else {
-               free(ureq);
+       if (free_cache >= 0) {
+               cached_cnt++;
+               i = free_cache;
+               log(L_DEBUG, "not found, using free cache[%u][%u]", hash, i);
+               goto ret;
        }
        }
-       *ureqp = NULL;
-}
 
 
-static void save_in_cell(user_req **cell, user_req *new_cached, unsigned now_ms)
-{
        unsigned oldest_idx = 0;
        unsigned oldest_age = 0;
        unsigned oldest_idx = 0;
        unsigned oldest_age = 0;
-       unsigned age;
-       unsigned i;
-
        for (i = 0; i < 8; i++) {
        for (i = 0; i < 8; i++) {
-               if (!cell[i]) {
-                       log(L_DEBUG, "using free cache[x][%u]", i);
-                       cached_cnt++;
-                       cell[i] = new_cached;
-                       aging_interval_ms = min_aging_interval_ms;
-                       return;
-               }
-               age = cache_age(now_ms, cell[i]);
+               unsigned age = cache_age(cacheline[i]);
                if (age > oldest_age) {
                        oldest_age = age;
                        oldest_idx = i;
                }
        }
                if (age > oldest_age) {
                        oldest_age = age;
                        oldest_idx = i;
                }
        }
-       log(L_DEBUG, "freeing and reusing cache[x][%u] (age %u)", oldest_idx, oldest_age);
-       if (cell[oldest_idx]) {
-               free_refcounted_ureq(&cell[oldest_idx]);
-       } else {
-               cached_cnt++;
+       if (oldest_age == 0) {
+               /* All entries in cacheline are "future" entries!
+                * This is very unlikely, but we must still work correctly.
+                * We call this "fake cache entry".
+                * The data will be "cached" only for the duration
+                * of this client's request lifetime.
+                */
+               log(L_DEBUG, "not found, and cache[%u] is full: using fake cache entry", hash);
+               return NULL;
        }
        }
-       cell[oldest_idx] = new_cached;
-       aging_interval_ms = min_aging_interval_ms;
+       i = oldest_idx;
+       log(L_DEBUG, "not found, freeing and reusing cache[%u][%u] (age %u)", hash, i, oldest_age);
+       free_refcounted_ureq(&cacheline[i]);
+
+ ret:
+       cacheline[i] = MAKE_FUTURE_PTR(ureq);
+       return &cacheline[i];
 }
 
 }
 
-static void age_cache(unsigned now_ms, int srv)
+static void age_cache(unsigned free_all, int srv)
 {
        user_req **cp = *cache;
        int i;
        unsigned sv = cached_cnt;
 
 {
        user_req **cp = *cache;
        int i;
        unsigned sv = cached_cnt;
 
-       log(L_DEBUG, "aging cache, srv:%d, now:%u", srv, now_ms);
-       if (srv == -1 || !now_ms)
+       log(L_DEBUG, "aging cache, srv:%d, free_all:%u", srv, free_all);
+       if (srv == -1 || free_all)
                aging_interval_ms = INT_MAX;
        i = cache_size * 8;
        do {
                user_req *cached = *cp;
                aging_interval_ms = INT_MAX;
        i = cache_size * 8;
        do {
                user_req *cached = *cp;
-               if (cached) {
+               if (CACHED_ENTRY(cached) && cached != NULL) {
                        int csrv = type_to_srv[cached->type];
                        if (srv == -1 || srv == csrv) {
                        int csrv = type_to_srv[cached->type];
                        if (srv == -1 || srv == csrv) {
-                               if (!now_ms) {
+                               if (free_all) {
                                        cached_cnt--;
                                        free_refcounted_ureq(cp);
                                } else {
                                        cached_cnt--;
                                        free_refcounted_ureq(cp);
                                } else {
-                                       unsigned age = cache_age(now_ms, cached);
+                                       unsigned age = cache_age(cached);
                                        response_header *resp = ureq_response(cached);
                                        unsigned ttl = (resp->found ? config.pttl : config.nttl)[csrv];
                                        if (age >= ttl) {
                                        response_header *resp = ureq_response(cached);
                                        unsigned ttl = (resp->found ? config.pttl : config.nttl)[csrv];
                                        if (age >= ttl) {
-                                               log(L_DEBUG, "freeing: age %u positive %d ttl %u", age, resp->found, ttl);
+                                               log(L_DEBUG2, "freeing: age %u positive %d ttl %u", age, resp->found, ttl);
                                                cached_cnt--;
                                                free_refcounted_ureq(cp);
                                        } else if (srv == -1) {
                                                cached_cnt--;
                                                free_refcounted_ureq(cp);
                                        } else if (srv == -1) {
@@ -1200,8 +1301,7 @@ static void age_cache(unsigned now_ms, int srv)
                cp++;
        } while (--i);
        log(L_INFO, "aged cache, freed:%u, remain:%u", sv - cached_cnt, cached_cnt);
                cp++;
        } while (--i);
        log(L_INFO, "aged cache, freed:%u, remain:%u", sv - cached_cnt, cached_cnt);
-       if (srv == -1 || !now_ms)
-               log(L_DEBUG, "aging interval now %u ms", aging_interval_ms);
+       log(L_DEBUG2, "aging interval now %u ms", aging_interval_ms);
 }
 
 
 }
 
 
@@ -1213,8 +1313,6 @@ static void age_cache(unsigned now_ms, int srv)
 /* Returns stdout fd of the worker, in blocking mode */
 static int create_and_feed_worker(user_req *ureq)
 {
 /* Returns stdout fd of the worker, in blocking mode */
 static int create_and_feed_worker(user_req *ureq)
 {
-       static const char *const argv[] = { "worker_nscd", NULL };
-
        pid_t pid;
        struct {
                int rd;
        pid_t pid;
        struct {
                int rd;
@@ -1229,15 +1327,25 @@ static int create_and_feed_worker(user_req *ureq)
        if (pid < 0) /* error */
                perror_and_die("vfork");
        if (!pid) { /* child */
        if (pid < 0) /* error */
                perror_and_die("vfork");
        if (!pid) { /* child */
+               char param[sizeof(int)*3 + 2];
+               char *argv[3];
+
                close(to_child.wr);
                close(to_parent.rd);
                xmovefd(to_child.rd, 0);
                xmovefd(to_parent.wr, 1);
                close(to_child.wr);
                close(to_parent.rd);
                xmovefd(to_child.rd, 0);
                xmovefd(to_parent.wr, 1);
+               sprintf(param, "%u", debug);
+               argv[0] = (char*) "worker_nscd";
+               argv[1] = param;
+               argv[2] = NULL;
                /* Re-exec ourself, cleaning up all allocated memory.
                 * fds in parent are marked CLOEXEC and will be closed too
                 * (modulo bugs) */
                /* Re-exec ourself, cleaning up all allocated memory.
                 * fds in parent are marked CLOEXEC and will be closed too
                 * (modulo bugs) */
-               execve("/proc/self/exe", (char**)argv, (char**)(argv+1));
-               xexecve(self_exe_points_to, (char**)argv, (char**)(argv+1));
+               /* Try link name first: it's better to have comm field
+                * of "nscd" than "exe" (pgrep reported to fail to find us
+                * by name when comm field contains "exe") */
+               execve(self_exe_points_to, argv, argv+2);
+               xexecve("/proc/self/exe", argv, argv+2);
        }
 
        /* parent */
        }
 
        /* parent */
@@ -1250,6 +1358,7 @@ static int create_and_feed_worker(user_req *ureq)
        xsafe_write(to_child.wr, ureq, ureq_size(ureq));
 
        close(to_child.wr);
        xsafe_write(to_child.wr, ureq, ureq_size(ureq));
 
        close(to_child.wr);
+       close_on_exec(to_parent.rd);
        return to_parent.rd;
 }
 
        return to_parent.rd;
 }
 
@@ -1291,12 +1400,14 @@ static void worker_signal_handler(int sig)
        _exit(0);
 }
 
        _exit(0);
 }
 
-static void worker(void) NORETURN;
-static void worker(void)
+static void worker(const char *param) NORETURN;
+static void worker(const char *param)
 {
        user_req ureq;
        void *resp;
 
 {
        user_req ureq;
        void *resp;
 
+       debug = atoi(param);
+
        worker_ureq = &ureq; /* for signal handler */
 
        /* Make sure we won't hang, but rather die */
        worker_ureq = &ureq; /* for signal handler */
 
        /* Make sure we won't hang, but rather die */
@@ -1315,7 +1426,9 @@ static void worker(void)
        signal(SIGILL,    worker_signal_handler);
        signal(SIGFPE,    worker_signal_handler);
        signal(SIGABRT,   worker_signal_handler);
        signal(SIGILL,    worker_signal_handler);
        signal(SIGFPE,    worker_signal_handler);
        signal(SIGABRT,   worker_signal_handler);
+#ifdef SIGSTKFLT
        signal(SIGSTKFLT, worker_signal_handler);
        signal(SIGSTKFLT, worker_signal_handler);
+#endif
 
        if (ureq.type == GETHOSTBYNAME
         || ureq.type == GETHOSTBYNAMEv6
 
        if (ureq.type == GETHOSTBYNAME
         || ureq.type == GETHOSTBYNAMEv6
@@ -1332,7 +1445,11 @@ static void worker(void)
                        (ureq.type == GETHOSTBYADDR ? AF_INET : AF_INET6)
                ));
        } else if (ureq.type == GETPWBYNAME) {
                        (ureq.type == GETHOSTBYADDR ? AF_INET : AF_INET6)
                ));
        } else if (ureq.type == GETPWBYNAME) {
-               resp = marshal_passwd(getpwnam(ureq.reqbuf));
+               struct passwd *pw;
+               log(L_DEBUG2, "getpwnam('%s')", ureq.reqbuf);
+               pw = getpwnam(ureq.reqbuf);
+               log(L_DEBUG2, "getpwnam result:%p", pw);
+               resp = marshal_passwd(pw);
        } else if (ureq.type == GETPWBYUID) {
                resp = marshal_passwd(getpwuid(atoi(ureq.reqbuf)));
        } else if (ureq.type == GETGRBYNAME) {
        } else if (ureq.type == GETPWBYUID) {
                resp = marshal_passwd(getpwuid(atoi(ureq.reqbuf)));
        } else if (ureq.type == GETGRBYNAME) {
@@ -1364,27 +1481,30 @@ static void worker(void)
 ** Main loop
 */
 
 ** Main loop
 */
 
-static const char check_filenames[][sizeof("/etc/passwd")] = {
+static const char checked_filenames[][sizeof("/etc/passwd")] = {
        [SRV_PASSWD] = "/etc/passwd", /*  "/etc/shadow"? */
        [SRV_GROUP]  = "/etc/group",
        [SRV_HOSTS]  = "/etc/hosts", /* "/etc/resolv.conf" "/etc/nsswitch.conf"? */
 };
 
        [SRV_PASSWD] = "/etc/passwd", /*  "/etc/shadow"? */
        [SRV_GROUP]  = "/etc/group",
        [SRV_HOSTS]  = "/etc/hosts", /* "/etc/resolv.conf" "/etc/nsswitch.conf"? */
 };
 
-static struct stat check_statbuf[sizeof(check_filenames) / sizeof(check_filenames[0])];
+static long checked_status[ARRAY_SIZE(checked_filenames)];
 
 static void check_files(int srv)
 {
 
 static void check_files(int srv)
 {
-       const char *file = check_filenames[srv];
-       struct stat *sb = &check_statbuf[srv];
-       struct stat tempbuf;
-       
-       memset(&tempbuf, 0, sizeof(tempbuf));
-       stat(file, &tempbuf); /* ignore errors */
-       tempbuf.st_atime = 0; /* this is not a change */
-       if (memcmp(sb, &tempbuf, sizeof(tempbuf)) != 0) {
+       struct stat tsb;
+       const char *file = checked_filenames[srv];
+       long v;
+
+       memset(&tsb, 0, sizeof(tsb));
+       stat(file, &tsb); /* ignore errors */
+       /* Comparing struct stat's was giving false positives.
+        * Extracting only those fields which are interesting: */
+       v = (long)tsb.st_mtime ^ (long)tsb.st_size ^ (long)tsb.st_ino; /* ^ (long)tsb.st_dev ? */
+
+       if (v != checked_status[srv]) {
+               checked_status[srv] = v;
                log(L_INFO, "detected change in %s", file);
                log(L_INFO, "detected change in %s", file);
-               memcpy(sb, &tempbuf, sizeof(tempbuf));
-               age_cache(0, srv); /* frees entire cache */
+               age_cache(/*free_all:*/ 1, srv);
        }
 }
 
        }
 }
 
@@ -1393,30 +1513,28 @@ static int handle_client(int i)
 {
        int srv;
        user_req *ureq = cinfo[i].ureq;
 {
        int srv;
        user_req *ureq = cinfo[i].ureq;
-       user_req **cell;
+       user_req **cache_pp;
        user_req *ureq_and_resp;
 
        user_req *ureq_and_resp;
 
+#if DEBUG_BUILD
        log(L_DEBUG, "version:%d type:%d(%s) key_len:%d '%s'",
        log(L_DEBUG, "version:%d type:%d(%s) key_len:%d '%s'",
-                       ureq->version, ureq->type, typestr[ureq->type],
+                       ureq->version, ureq->type,
+                       ureq->type < ARRAY_SIZE(typestr) ? typestr[ureq->type] : "BAD",
                        ureq->key_len, req_str(ureq->type, ureq->reqbuf));
                        ureq->key_len, req_str(ureq->type, ureq->reqbuf));
-       hex_dump(cinfo[i].ureq, cinfo[i].bytecnt);
+#endif
 
 
-       if (unsupported_ureq_type(ureq->type)) {
-               /* We don't know this request. Just close the connection */
-               /* (glibc client interprets this like "not supported by this nscd") */
-               log(L_INFO, "unsupported query, dropping");
+       if (ureq->version != NSCD_VERSION) {
+               log(L_INFO, "wrong version");
                close_client(i);
                return 0;
        }
                close_client(i);
                return 0;
        }
-       srv = type_to_srv[ureq->type];
-       if (!config.srv_enable[srv]) {
-               log(L_INFO, "service %d is disabled, dropping", srv);
+       if (ureq->key_len > sizeof(ureq->reqbuf)) {
+               log(L_INFO, "bogus key_len %u - ignoring", ureq->key_len);
                close_client(i);
                return 0;
        }
                close_client(i);
                return 0;
        }
-
        if (cinfo[i].bytecnt < USER_HDR_SIZE + ureq->key_len) {
        if (cinfo[i].bytecnt < USER_HDR_SIZE + ureq->key_len) {
-               log(L_INFO, "read %d, need %d more to read",
+               log(L_INFO, "read %d, need to read %d",
                        cinfo[i].bytecnt, USER_HDR_SIZE + ureq->key_len);
                return 0; /* more to read */
        }
                        cinfo[i].bytecnt, USER_HDR_SIZE + ureq->key_len);
                return 0; /* more to read */
        }
@@ -1425,11 +1543,56 @@ static int handle_client(int i)
                close_client(i);
                return 0;
        }
                close_client(i);
                return 0;
        }
-       if (ureq->version != NSCD_VERSION) {
-               log(L_INFO, "wrong version");
+       if (unsupported_ureq_type(ureq->type)) {
+               /* We don't know this request. Just close the connection.
+                * (glibc client interprets this like "not supported by this nscd")
+                * Happens very often, thus DEBUG, not INFO */
+               log(L_DEBUG, "unsupported query, dropping");
+               close_client(i);
+               return 0;
+       }
+       srv = type_to_srv[ureq->type];
+       if (!config.srv_enable[srv]) {
+               log(L_INFO, "service %d is disabled, dropping", srv);
                close_client(i);
                return 0;
        }
                close_client(i);
                return 0;
        }
+
+       hex_dump(cinfo[i].ureq, cinfo[i].bytecnt);
+
+       if (ureq->type == SHUTDOWN
+        || ureq->type == INVALIDATE
+       ) {
+#ifdef SO_PEERCRED
+               struct ucred caller;
+               socklen_t optlen = sizeof(caller);
+               if (getsockopt(pfd[i].fd, SOL_SOCKET, SO_PEERCRED, &caller, &optlen) < 0) {
+                       log(L_INFO, "ignoring special request - cannot get caller's id: %s", strerror(errno));
+                       close_client(i);
+                       return 0;
+               }
+               if (caller.uid != 0) {
+                       log(L_INFO, "special request from non-root - ignoring");
+                       close_client(i);
+                       return 0;
+               }
+#endif
+               if (ureq->type == SHUTDOWN) {
+                       log(L_INFO, "got shutdown request, exiting");
+                       exit(0);
+               }
+               if (!ureq->key_len || ureq->reqbuf[ureq->key_len - 1]) {
+                       log(L_INFO, "malformed invalidate request - ignoring");
+                       close_client(i);
+                       return 0;
+               }
+               log(L_INFO, "got invalidate request, flushing cache");
+               /* Frees entire cache. TODO: replace -1 with service (in ureq->reqbuf) */
+               age_cache(/*free_all:*/ 1, -1);
+               close_client(i);
+               return 0;
+       }
+
        if (ureq->type != GETHOSTBYADDR
         && ureq->type != GETHOSTBYADDRv6
        ) {
        if (ureq->type != GETHOSTBYADDR
         && ureq->type != GETHOSTBYADDRv6
        ) {
@@ -1444,104 +1607,167 @@ static int handle_client(int i)
                check_files(srv);
        }
 
                check_files(srv);
        }
 
-       /* If in cache, save ptr to response into cinfo and return */
-       ureq_and_resp = find_cell_and_response(&cell, ureq);
+       cache_pp = lookup_in_cache(ureq);
+       ureq_and_resp = cache_pp ? *cache_pp : NULL;
+
        if (ureq_and_resp) {
        if (ureq_and_resp) {
-               response_header *resp = ureq_response(ureq_and_resp);
-               unsigned sz = resp->version_or_size;
-               log(L_DEBUG, "sz:%u", sz);
-               hex_dump(resp, sz);
-               ureq_and_resp->refcount = 1; /* cache shouldn't free it under us! */
-               pfd[i].events = POLLOUT; /* we want to write out */
-               cinfo[i].resptr = ureq_and_resp;
-               cinfo[i].respos = 0;
-               //cinfo[i].resp_sz = sz;
-               return 1;
+               if (CACHED_ENTRY(ureq_and_resp)) {
+                       /* Found. Save ptr to response into cinfo and return */
+                       response_header *resp = ureq_response(ureq_and_resp);
+                       unsigned sz = resp->version_or_size;
+
+                       log(L_DEBUG, "sz:%u", sz);
+                       hex_dump(resp, sz);
+                       ureq_and_resp->refcount++; /* cache shouldn't free it under us! */
+                       pfd[i].events = POLLOUT; /* we want to write out */
+                       cinfo[i].resptr = ureq_and_resp;
+                       /*cinfo[i].respos = 0; - already is */
+                       /* prevent future matches with anything */
+                       cinfo[i].cache_pp = (void *) 1;
+                       return 1; /* "ready to write data out to client" */
+               }
+
+               /* Not found. Remember a pointer where it will appear */
+               cinfo[i].cache_pp = cache_pp;
+
+               /* If it does not point to our own ureq buffer... */
+               if (CACHE_PTR(ureq_and_resp) != ureq) {
+                       /* We are not the first client who wants this */
+                       log(L_DEBUG, "another request is in progress (%p), waiting for its result", ureq_and_resp);
+                       MARK_PTR_SHARED(cache_pp); /* "please inform us when it's ready" */
+                       /* "we do not wait for client anymore" */
+                       cinfo[i].client_fd = pfd[i].fd;
+                       /* Don't wait on fd. Worker response will unblock us */
+                       pfd[i].events = 0;
+                       return 0;
+               }
+               /* else: lookup_in_cache inserted (ureq & 1) into *cache_pp:
+                * we are the first client to miss on this ureq. */
        }
 
        /* Start worker thread */
        }
 
        /* Start worker thread */
-       cinfo[i].cell = cell;
+       log(L_DEBUG, "stored %p in cache, starting a worker", ureq_and_resp);
        /* Now we will wait on worker's fd, not client's! */
        cinfo[i].client_fd = pfd[i].fd;
        pfd[i].fd = create_and_feed_worker(ureq);
        /* Now we will wait on worker's fd, not client's! */
        cinfo[i].client_fd = pfd[i].fd;
        pfd[i].fd = create_and_feed_worker(ureq);
-
-/* We can do it here, but we don't really need to.
- * We need to have client_buf[] big enough anyway for worst case scenario,
- * so we can simply keep cbuf allocated until we close a client.
-       cinfo[i].ureq = NULL;
-       busy_cbuf[cinfo[i].bufidx] = 0;
- */
        return 0;
 }
 
        return 0;
 }
 
-/* When we return, reply is fully read and stored in cache,
+static void prepare_for_writeout(unsigned i, user_req *cached)
+{
+       log(L_DEBUG2, "client %u: data is ready at %p", i, cached);
+
+       if (cinfo[i].client_fd) {
+               pfd[i].fd = cinfo[i].client_fd;
+               cinfo[i].client_fd = 0; /* "we don't wait for worker reply" */
+       }
+       pfd[i].events = POLLOUT;
+
+       /* Writeout position etc */
+       cinfo[i].resptr = cached;
+       /*cinfo[i].respos = 0; - already is */
+       /* if worker took some time to get info (e.g. DNS query),
+        * prevent client timeout from triggering at once */
+       cinfo[i].started_ms = g_now_ms;
+}
+
+/* Worker seems to be ready to write the response.
+ * When we return, response is fully read and stored in cache,
  * worker's fd is closed, pfd[i] and cinfo[i] are updated. */
  * worker's fd is closed, pfd[i] and cinfo[i] are updated. */
-static void handle_worker_response(int i, unsigned now_ms)
+static void handle_worker_response(int i)
 {
 {
-       response_header sz_and_found;
+       struct { /* struct response_header + small body */
+               uint32_t version_or_size;
+               int32_t found;
+               char body[256 - 8];
+       } sz_and_found;
        user_req *cached;
        user_req *cached;
-       user_req *ureq = cinfo[i].ureq;
+       user_req *ureq;
        response_header *resp;
        response_header *resp;
-       unsigned resp_sz;
-       unsigned ureq_sz_aligned = (char*)ureq_response(ureq) - (char*)ureq;
-
-       /* Replies can be big (getgrnam("guest") on a big user db),
-        * we cannot rely on them being atomic. However, we know that worker
-        * _always_ gives reply in one full_write(), so loop and read it all
-        * (looping is implemented inside full_read()) */
-       resp_sz = full_read(pfd[i].fd, &sz_and_found, 8);
-       if (resp_sz != 8) {
+       unsigned sz, resp_sz;
+       unsigned ureq_sz_aligned;
+
+       cached = NULL;
+       ureq = cinfo[i].ureq;
+       ureq_sz_aligned = (char*)ureq_response(ureq) - (char*)ureq;
+
+       sz = full_read(pfd[i].fd, &sz_and_found, sizeof(sz_and_found));
+       if (sz < 8) {
                /* worker was killed? */
                /* worker was killed? */
-               log(L_DEBUG, "worker gave short reply:%u != 8", resp_sz);
+               log(L_DEBUG, "worker gave short reply:%u < 8", sz);
                goto err;
        }
 
        resp_sz = sz_and_found.version_or_size;
                goto err;
        }
 
        resp_sz = sz_and_found.version_or_size;
-       if (resp_sz < 8 || resp_sz > 0xfffffff) { /* 256 mb */
+       if (resp_sz < sz || resp_sz > 0xfffffff) { /* 256 mb */
                error("BUG: bad size from worker:%u", resp_sz);
                goto err;
        }
 
        /* Create new block of cached info */
        cached = xzalloc(ureq_sz_aligned + resp_sz);
                error("BUG: bad size from worker:%u", resp_sz);
                goto err;
        }
 
        /* Create new block of cached info */
        cached = xzalloc(ureq_sz_aligned + resp_sz);
-       resp = (void*) ((char*)cached + ureq_sz_aligned);
+       log(L_DEBUG2, "xzalloc(%u):%p", ureq_sz_aligned + resp_sz, cached);
+       resp = (void*) (((char*) cached) + ureq_sz_aligned);
        memcpy(cached, ureq, ureq_size(ureq));
        memcpy(cached, ureq, ureq_size(ureq));
-       resp->version_or_size = resp_sz;
-       resp->found = sz_and_found.found;
-       if (sz_and_found.found) {
+       memcpy(resp, &sz_and_found, sz);
+       if (sz_and_found.found && resp_sz > sz) {
                /* We need to read data only if it's found
                /* We need to read data only if it's found
-                * (otherwise worker sends only 8 bytes) */
-               if (full_read(pfd[i].fd, resp->body, resp_sz - 8) != resp_sz - 8) {
+                * (otherwise worker sends only 8 bytes).
+                *
+                * Replies can be big (getgrnam("guest") on a big user db),
+                * we cannot rely on them being atomic. However, we know
+                * that worker _always_ gives reply in one full_write(),
+                * so we loop and read it all
+                * (looping is implemented inside full_read())
+                */
+               if (full_read(pfd[i].fd, ((char*) resp) + sz, resp_sz - sz) != resp_sz - sz) {
                        /* worker was killed? */
                        /* worker was killed? */
-                       log(L_DEBUG, "worker gave short reply");
-                       free(cached);
+                       log(L_DEBUG, "worker gave short reply, free(%p)", cached);
  err:
  err:
+                       free(cached);
                        cached = NULL;
                        goto wo;
                }
        }
                        cached = NULL;
                        goto wo;
                }
        }
+       set_cache_timestamp(cached);
        hex_dump(resp, resp_sz);
        hex_dump(resp, resp_sz);
-       /* save in cache */
-       cached->refcount = 1; /* cache shouldn't free it under us! */
-       set_cache_timestamp(cached, now_ms);
-       save_in_cell(cinfo[i].cell, cached, now_ms);
 
  wo:
        close(pfd[i].fd);
 
 
  wo:
        close(pfd[i].fd);
 
-       /* schedule for writeout */
-       pfd[i].fd = cinfo[i].client_fd;
-       cinfo[i].client_fd = 0; /* no, we don't wait for worker reply anymore */
-       pfd[i].events = POLLOUT;
-       /* pfd[i].revents = 0; - not needed? */
+       /* Save in cache */
+       unsigned ref = 0;
+       user_req **cache_pp = cinfo[i].cache_pp;
+       if (cache_pp != NULL) { /* if not a fake entry */
+               ureq = *cache_pp;
+               *cache_pp = cached;
+               if (CACHE_SHARED(ureq)) {
+                       /* Other clients wait for this response too,
+                        * wake them (and us) up and set refcount = no_of_clients */
+                       unsigned j;
+
+                       for (j = 2; j < num_clients; j++) {
+                               if (cinfo[j].cache_pp == cache_pp) {
+                                       /* This client uses the same cache entry */
+                                       ref++;
+                                       /* prevent future matches with anything */
+                                       cinfo[j].cache_pp = (void *) 1;
+                                       prepare_for_writeout(j, cached);
+                               }
+                       }
+                       goto ret;
+               }
+               /* prevent future matches with anything */
+               cinfo[i].cache_pp = (void *) 1;
+               ref = 1;
+       }
 
 
-       /* writeout position etc */
-       cinfo[i].resptr = cached;
-       cinfo[i].respos = 0;
-       //cinfo[i].resp_sz = resp_sz;
-       /* if worker took some time to get info (e.g. DNS query),
-        * prevent client timeout from triggering at once */
-       cinfo[i].started_ms = now_ms;
+       prepare_for_writeout(i, cached);
+ret:
+       /* cache shouldn't free it under us! */
+       if (cached)
+               cached->refcount = ref;
+       aging_interval_ms = min_aging_interval_ms;
 }
 
 static void main_loop(void)
 }
 
 static void main_loop(void)
@@ -1556,26 +1782,32 @@ static void main_loop(void)
        while (1) {
                int i, j;
                int r;
        while (1) {
                int i, j;
                int r;
-               unsigned now_ms;
 
                r = SMALL_POLL_TIMEOUT_MS;
                if (num_clients <= 2 && !cached_cnt)
                        r = -1; /* infinite */
                else if (num_clients < max_reqnum)
                        r = aging_interval_ms;
 
                r = SMALL_POLL_TIMEOUT_MS;
                if (num_clients <= 2 && !cached_cnt)
                        r = -1; /* infinite */
                else if (num_clients < max_reqnum)
                        r = aging_interval_ms;
-
 #if 0 /* Debug: leak detector */
                {
                        static unsigned long long cnt;
 #if 0 /* Debug: leak detector */
                {
                        static unsigned long long cnt;
-                       void *p = malloc(240); /* should not be too small */
+                       static unsigned long low_malloc = -1L;
+                       static unsigned long low_sbrk = -1L;
+                       void *p = malloc(540); /* should not be too small */
                        void *s = sbrk(0);
                        free(p);
                        void *s = sbrk(0);
                        free(p);
-                       log(L_INFO, "entering poll %llu (%d ms). num_clients:%u cached:%u %u/%u next malloc:%p, sbrk:%p",
-                               cnt, r, num_clients, cached_cnt, cache_hit_cnt, cache_access_cnt, p, s);
+                       if ((unsigned long)p < low_malloc)
+                               low_malloc = (unsigned long)p;
+                       if ((unsigned long)s < low_sbrk)
+                               low_sbrk = (unsigned long)s;
+                       log(L_INFO, "poll %llu (%d ms). clients:%u cached:%u %u/%u malloc:%p (%lu), sbrk:%p (%lu)",
+                               cnt, r, num_clients, cached_cnt, cache_hit_cnt, cache_access_cnt,
+                               p, (unsigned long)p - low_malloc,
+                               s, (unsigned long)s - low_sbrk);
                        cnt++;
                }
 #else
                        cnt++;
                }
 #else
-               log(L_DEBUG, "entering poll (%d ms). num_clients:%u cached:%u hit_ratio:%u/%u",
+               log(L_DEBUG, "poll %d ms. clients:%u cached:%u hit ratio:%u/%u",
                                r, num_clients, cached_cnt, cache_hit_cnt, cache_access_cnt);
 #endif
 
                                r, num_clients, cached_cnt, cache_hit_cnt, cache_access_cnt);
 #endif
 
@@ -1591,7 +1823,7 @@ static void main_loop(void)
                 * There is no blocking I/O (except when we talk to worker thread
                 * which is guaranteed to not block us for long) */
 
                 * There is no blocking I/O (except when we talk to worker thread
                 * which is guaranteed to not block us for long) */
 
-               now_ms = monotonic_ms();
+               g_now_ms = monotonic_ms();
                if (r == 0)
                        goto skip_fd_checks;
 
                if (r == 0)
                        goto skip_fd_checks;
 
@@ -1609,13 +1841,14 @@ static void main_loop(void)
                        ndelay_on(cfd);
                        close_on_exec(cfd);
                        /* x[num_clients] is next free element, taking it */
                        ndelay_on(cfd);
                        close_on_exec(cfd);
                        /* x[num_clients] is next free element, taking it */
+                       log(L_DEBUG2, "new client %d, fd %d", num_clients, cfd);
                        pfd[num_clients].fd = cfd;
                        pfd[num_clients].events = POLLIN;
                        /* this will make us do read() in next for() loop: */
                        pfd[num_clients].revents = POLLIN;
                        memset(&cinfo[num_clients], 0, sizeof(cinfo[num_clients]));
                        /* cinfo[num_clients].bytecnt = 0; - done */
                        pfd[num_clients].fd = cfd;
                        pfd[num_clients].events = POLLIN;
                        /* this will make us do read() in next for() loop: */
                        pfd[num_clients].revents = POLLIN;
                        memset(&cinfo[num_clients], 0, sizeof(cinfo[num_clients]));
                        /* cinfo[num_clients].bytecnt = 0; - done */
-                       cinfo[num_clients].started_ms = now_ms;
+                       cinfo[num_clients].started_ms = g_now_ms;
                        cinfo[num_clients].bufidx = alloc_buf_no();
                        cinfo[num_clients].ureq = bufno2buf(cinfo[num_clients].bufidx);
                        num_clients++;
                        cinfo[num_clients].bufidx = alloc_buf_no();
                        cinfo[num_clients].ureq = bufno2buf(cinfo[num_clients].bufidx);
                        num_clients++;
@@ -1644,14 +1877,24 @@ static void main_loop(void)
                                resp = ureq_response(cinfo[i].resptr);
                                resp_sz = resp->version_or_size;
                                resp->version_or_size = NSCD_VERSION;
                                resp = ureq_response(cinfo[i].resptr);
                                resp_sz = resp->version_or_size;
                                resp->version_or_size = NSCD_VERSION;
-                               r = safe_write(pfd[i].fd, resp + cinfo[i].respos, resp_sz - cinfo[i].respos);
+                               r = safe_write(pfd[i].fd, ((char*) resp) + cinfo[i].respos, resp_sz - cinfo[i].respos);
                                resp->version_or_size = resp_sz;
 
                                if (r < 0 && errno == EAGAIN)
                                        continue;
                                if (r <= 0) { /* client isn't there anymore */
                                resp->version_or_size = resp_sz;
 
                                if (r < 0 && errno == EAGAIN)
                                        continue;
                                if (r <= 0) { /* client isn't there anymore */
+                                       log(L_DEBUG, "client %d is gone (write returned %d)", i, r);
  write_out_is_done:
  write_out_is_done:
-                                       free_refcounted_ureq(&cinfo[i].resptr);
+                                       if (cinfo[i].cache_pp == NULL) {
+                                               log(L_DEBUG, "client %d: freeing fake cache entry %p", i, cinfo[i].resptr);
+                                               free(cinfo[i].resptr);
+                                       } else {
+                                               /* Most of the time, it is not freed here,
+                                                * only refcounted--. Freeing happens
+                                                * if it was deleted from cache[] but retained
+                                                * for writeout. */
+                                               free_refcounted_ureq(&cinfo[i].resptr);
+                                       }
                                        close_client(i);
                                        continue;
                                }
                                        close_client(i);
                                        continue;
                                }
@@ -1665,6 +1908,7 @@ static void main_loop(void)
                                         * read(3, "www.google.com\0\0", 16) = 16
                                         * close(3) = 0
                                         */
                                         * read(3, "www.google.com\0\0", 16) = 16
                                         * close(3) = 0
                                         */
+                                       log(L_DEBUG, "client %u: sent answer %u bytes", i, cinfo[i].respos);
                                        goto write_out_is_done;
                                }
                        }
                                        goto write_out_is_done;
                                }
                        }
@@ -1673,15 +1917,37 @@ static void main_loop(void)
                         * already dead, revents may contain other bits too */
                        if ((pfd[i].revents & POLLIN) && cinfo[i].client_fd) {
                                log(L_DEBUG, "reading response for client %u", i);
                         * already dead, revents may contain other bits too */
                        if ((pfd[i].revents & POLLIN) && cinfo[i].client_fd) {
                                log(L_DEBUG, "reading response for client %u", i);
-                               handle_worker_response(i, now_ms);
+                               handle_worker_response(i);
                                /* We can immediately try to write a response
                                 * to client */
                                goto write_out;
                        }
 
                                /* We can immediately try to write a response
                                 * to client */
                                goto write_out;
                        }
 
+                       /* POLLHUP means pfd[i].fd is closed by peer.
+                        * POLLHUP+POLLOUT is seen when we switch for writeout
+                        * and see that pfd[i].fd is closed by peer. */
+                       if ((pfd[i].revents & ~POLLOUT) == POLLHUP) {
+                               int is_client = (cinfo[i].client_fd == 0 || cinfo[i].client_fd == pfd[i].fd);
+                               log(L_INFO, "%s %u disappeared (got POLLHUP on fd %d)",
+                                       is_client ? "client" : "worker",
+                                       i,
+                                       pfd[i].fd
+                               );
+                               if (is_client)
+                                       close_client(i);
+                               else {
+                                       /* Read worker output anyway, error handling
+                                        * in that function deals with short read.
+                                        * Simply closing client is wrong: it leaks
+                                        * shared future entries. */
+                                       handle_worker_response(i);
+                               }
+                               continue;
+                       }
+
                        /* All strange and unexpected cases */
                        if (pfd[i].revents != POLLIN) {
                        /* All strange and unexpected cases */
                        if (pfd[i].revents != POLLIN) {
-                               /* Not just "can read" - prolly POLLHUP too */
+                               /* Not just "can read", but some other bits are there */
                                log(L_INFO, "client %u revents is strange:%x", i, pfd[i].revents);
                                close_client(i);
                                continue;
                                log(L_INFO, "client %u revents is strange:%x", i, pfd[i].revents);
                                close_client(i);
                                continue;
@@ -1703,44 +1969,6 @@ static void main_loop(void)
                        }
                        cinfo[i].bytecnt += r;
                        if (cinfo[i].bytecnt >= sizeof(user_req_header)) {
                        }
                        cinfo[i].bytecnt += r;
                        if (cinfo[i].bytecnt >= sizeof(user_req_header)) {
-                               if (cinfo[i].ureq->type == SHUTDOWN
-                                || cinfo[i].ureq->type == INVALIDATE
-                               ) {
-                                       const char *service;
-                                       unsigned len;
-#ifdef SO_PEERCRED
-                                       struct ucred caller;
-                                       socklen_t optlen = sizeof(caller);
-                                       if (getsockopt(pfd[i].fd, SOL_SOCKET, SO_PEERCRED, &caller, &optlen) < 0) {
-                                               log(L_INFO, "ignoring special request - cannot get caller's id: %s", strerror(errno));
-                                               close_client(i);
-                                               continue;
-                                       }
-                                       if (caller.uid != 0) {
-                                               log(L_INFO, "special request from non-root - ignoring");
-                                               close_client(i);
-                                               continue;
-                                       }
-#endif
-                                       if (cinfo[i].ureq->type == SHUTDOWN) {
-                                               log(L_INFO, "got shutdown request, exiting");
-                                               return; /* exits nscd */;
-                                       }
-                                       len = cinfo[i].ureq->key_len;
-                                       service = (char*)&cinfo[i].ureq + len;
-                                       if (sizeof(user_req_header) + len != cinfo[i].bytecnt
-                                        || !len
-                                        || service[len-1] != '\0'
-                                       ) {
-                                               log(L_INFO, "malformed invalidate request - ignoring");
-                                               close_client(i);
-                                               continue;
-                                       }
-                                       log(L_INFO, "got invalidate request, flushing cache");
-                                       age_cache(0, -1); /* frees entire cache. TODO: replace -1 with service */
-                                       close_client(i);
-                                       continue;
-                               }
                                if (handle_client(i)) {
                                        /* Response is found in cache! */
                                        goto write_out;
                                if (handle_client(i)) {
                                        /* Response is found in cache! */
                                        goto write_out;
@@ -1750,17 +1978,19 @@ static void main_loop(void)
 
  skip_fd_checks:
                /* Age cache */
 
  skip_fd_checks:
                /* Age cache */
-               if ((now_ms - last_age_time) >= aging_interval_ms) {
-                       last_age_time = now_ms;
-                       age_cache(now_ms, -1);
+               if ((g_now_ms - last_age_time) >= aging_interval_ms) {
+                       last_age_time = g_now_ms;
+                       age_cache(/*free_all:*/ 0, -1);
                }
 
                /* Close timed out client connections */
                for (i = 2; i < num_clients; i++) {
                }
 
                /* Close timed out client connections */
                for (i = 2; i < num_clients; i++) {
-                       if (pfd[i].fd && !cinfo[i].client_fd
-                        && (now_ms - cinfo[i].started_ms) > CLIENT_TIMEOUT_MS
+                       if (pfd[i].fd != 0 /* not closed yet? */ ////
+                        && cinfo[i].client_fd == 0 /* do we still wait for client, not worker? */
+                        && (g_now_ms - cinfo[i].started_ms) > CLIENT_TIMEOUT_MS
                        ) {
                        ) {
-                               log(L_INFO, "timed out waiting for client %u, dropping", i);
+                               log(L_INFO, "timed out waiting for client %u (%u ms), dropping",
+                                       i, (unsigned)(g_now_ms - cinfo[i].started_ms));
                                close_client(i);
                        }
                }
                                close_client(i);
                        }
                }
@@ -1789,7 +2019,7 @@ static void main_loop(void)
 
  coalesce_done:
                num_clients -= cnt_closed;
 
  coalesce_done:
                num_clients -= cnt_closed;
-               log(L_DEBUG, "removing %d closed clients. num_clients:%d", cnt_closed, num_clients);
+               log(L_DEBUG, "removing %d closed clients. clients:%d", cnt_closed, num_clients);
                min_closed = INT_MAX;
                cnt_closed = 0;
                /* start accepting new connects */
                min_closed = INT_MAX;
                cnt_closed = 0;
                /* start accepting new connects */
@@ -1914,7 +2144,7 @@ static void handle_logfile(const char *str, int srv)
 
 static void handle_debuglvl(const char *str, int srv)
 {
 
 static void handle_debuglvl(const char *str, int srv)
 {
-       debug |= getnum(str);
+       debug |= (uint8_t) getnum(str);
 }
 
 static void handle_threads(const char *str, int srv)
 }
 
 static void handle_threads(const char *str, int srv)
@@ -2017,7 +2247,7 @@ static void parse_conffile(const char *conffile, int warn)
                word = conf_words;
                while (1) {
                        if (strcmp(word->str + 1, p) == 0) {
                word = conf_words;
                while (1) {
                        if (strcmp(word->str + 1, p) == 0) {
-                               int srv;
+                               int srv = 0;
                                p = skip_whitespace(p + strlen(p) + 1);
                                *skip_non_whitespace(p) = '\0';
                                if (word->str[0] == 'S') {
                                p = skip_whitespace(p + strlen(p) + 1);
                                *skip_non_whitespace(p) = '\0';
                                if (word->str[0] == 'S') {
@@ -2147,8 +2377,7 @@ static void special_op(const char *arg)
 
        if (!arg) { /* shutdown */
                xfull_write(sock, &ureq, sizeof(ureq));
 
        if (!arg) { /* shutdown */
                xfull_write(sock, &ureq, sizeof(ureq));
-               dup2(2, 1);
-               error_and_die("sent shutdown request, exiting");
+               printf("sent shutdown request, exiting\n");
        } else { /* invalidate */
                size_t arg_len = strlen(arg) + 1;
                struct {
        } else { /* invalidate */
                size_t arg_len = strlen(arg) + 1;
                struct {
@@ -2160,9 +2389,9 @@ static void special_op(const char *arg)
                reqdata.req.key_len = arg_len;
                memcpy(reqdata.arg, arg, arg_len);
                xfull_write(sock, &reqdata, arg_len + sizeof(ureq));
                reqdata.req.key_len = arg_len;
                memcpy(reqdata.arg, arg, arg_len);
                xfull_write(sock, &reqdata, arg_len + sizeof(ureq));
-               dup2(2, 1);
-               error_and_die("sent invalidate(%s) request, exiting", arg);
+               printf("sent invalidate(%s) request, exiting\n", arg);
        }
        }
+       exit(0);
 }
 
 
 }
 
 
@@ -2173,6 +2402,7 @@ void __nss_disable_nscd(void);
 int main(int argc, char **argv)
 {
        int n;
 int main(int argc, char **argv)
 {
        int n;
+       unsigned opt_d_cnt;
        const char *env_U;
        const char *conffile;
 
        const char *env_U;
        const char *conffile;
 
@@ -2180,18 +2410,30 @@ int main(int argc, char **argv)
        __nss_disable_nscd();
 
        if (argv[0][0] == 'w') /* "worker_nscd" */
        __nss_disable_nscd();
 
        if (argv[0][0] == 'w') /* "worker_nscd" */
-               worker();
+               worker(argv[1]);
 
        setlinebuf(stdout);
        setlinebuf(stderr);
 
 
        setlinebuf(stdout);
        setlinebuf(stderr);
 
+       /* Make sure stdio is not closed */
+       n = xopen3("/dev/null", O_RDWR, 0);
+       while (n < 2)
+               n = dup(n);
+       /* Close unexpected open file descriptors */
+       n |= 0xff; /* start from at least fd# 255 */
+       do {
+               close(n--);
+       } while (n > 2);
+
        /* For idiotic kernels which disallow "exec /proc/self/exe" */
        readlink_self_exe();
 
        conffile = default_conffile;
        /* For idiotic kernels which disallow "exec /proc/self/exe" */
        readlink_self_exe();
 
        conffile = default_conffile;
+       opt_d_cnt = 0;
        while ((n = getopt_long(argc, argv, "df:i:KVgt:", longopt, NULL)) != -1) {
                switch (n) {
                case 'd':
        while ((n = getopt_long(argc, argv, "df:i:KVgt:", longopt, NULL)) != -1) {
                switch (n) {
                case 'd':
+                       opt_d_cnt++;
                        debug &= ~D_DAEMON;
                        break;
                case 'f':
                        debug &= ~D_DAEMON;
                        break;
                case 'f':
@@ -2219,6 +2461,12 @@ int main(int argc, char **argv)
                        print_help_and_die();
                }
        }
                        print_help_and_die();
                }
        }
+       /* Multiple -d can bump debug regardless of nscd.conf:
+        * no -d or -d: 0, -dd: 1,
+        * -ddd: 3, -dddd: 7, -ddddd: 15
+        */
+       if (opt_d_cnt != 0)
+               debug |= (((1U << opt_d_cnt) >> 1) - 1) & L_ALL;
 
        env_U = getenv("U");
        /* Avoid duplicate warnings if $U exists */
 
        env_U = getenv("U");
        /* Avoid duplicate warnings if $U exists */
@@ -2244,37 +2492,27 @@ int main(int argc, char **argv)
                if (putenv(user_to_env_U(config.user)))
                        error_and_die("out of memory");
                /* fds leaked by nss will be closed by execed copy */
                if (putenv(user_to_env_U(config.user)))
                        error_and_die("out of memory");
                /* fds leaked by nss will be closed by execed copy */
-               execv("/proc/self/exe", argv);
-               xexecve(self_exe_points_to, argv, environ);
+               execv(self_exe_points_to, argv);
+               xexecve("/proc/self/exe", argv, environ);
        }
 
        /* Allocate dynamically sized stuff */
        max_reqnum += 2; /* account for 2 first "fake" clients */
        if (max_reqnum < 8) max_reqnum = 8; /* sanitize */
        }
 
        /* Allocate dynamically sized stuff */
        max_reqnum += 2; /* account for 2 first "fake" clients */
        if (max_reqnum < 8) max_reqnum = 8; /* sanitize */
-       if (max_reqnum > 0xffff) max_reqnum = 0xffff;
-       log(L_DEBUG, "will handle %u requests in parallel", max_reqnum - 2);
+       /* Since refcount is a byte, can't serve more than 255-2 clients
+        * at once. The rest will block in connect() */
+       if (max_reqnum > 0xff) max_reqnum = 0xff;
        client_buf = xzalloc(max_reqnum * sizeof(client_buf[0]));
        busy_cbuf  = xzalloc(max_reqnum * sizeof(busy_cbuf[0]));
        pfd        = xzalloc(max_reqnum * sizeof(pfd[0]));
        cinfo      = xzalloc(max_reqnum * sizeof(cinfo[0]));
 
        cache_size = (config.size[0] + config.size[1] + config.size[2]) / 8;
        client_buf = xzalloc(max_reqnum * sizeof(client_buf[0]));
        busy_cbuf  = xzalloc(max_reqnum * sizeof(busy_cbuf[0]));
        pfd        = xzalloc(max_reqnum * sizeof(pfd[0]));
        cinfo      = xzalloc(max_reqnum * sizeof(cinfo[0]));
 
        cache_size = (config.size[0] + config.size[1] + config.size[2]) / 8;
-       if (cache_size < 64) cache_size = 64; /* 8*64 = 512 entries min */
+       if (cache_size < 8) cache_size = 8; /* 8*8 = 64 entries min */
        if (cache_size > 0xffff) cache_size = 0xffff; /* 8*64k entries max */
        cache_size |= 1; /* force it to be odd */
        if (cache_size > 0xffff) cache_size = 0xffff; /* 8*64k entries max */
        cache_size |= 1; /* force it to be odd */
-       log(L_DEBUG, "cache size %u x 8 entries", cache_size);
        cache = xzalloc(cache_size * sizeof(cache[0]));
 
        cache = xzalloc(cache_size * sizeof(cache[0]));
 
-       /* Make sure stdio is not closed */
-       n = xopen3("/dev/null", O_RDWR, 0);
-       while (n < 2)
-               n = dup(n);
-       /* Close unexpected open file descriptors */
-       n |= 0xff; /* start from at least fd# 255 */
-       do {
-               close(n--);
-       } while (n > 2);
-
        /* Register cleanup hooks */
        signal(SIGINT, cleanup_on_signal);
        signal(SIGTERM, cleanup_on_signal);
        /* Register cleanup hooks */
        signal(SIGINT, cleanup_on_signal);
        signal(SIGTERM, cleanup_on_signal);
@@ -2287,12 +2525,15 @@ int main(int argc, char **argv)
        signal(SIGALRM, SIG_DFL);
 #endif
 
        signal(SIGALRM, SIG_DFL);
 #endif
 
-       mkdir(NSCD_DIR, 0777);
+       if (mkdir(NSCD_DIR, 0755) == 0) {
+               /* prevent bad mode of NSCD_DIR if umask is e.g. 077 */
+               chmod(NSCD_DIR, 0755);
+       }
        pfd[0].fd = open_socket(NSCD_SOCKET);
        pfd[1].fd = open_socket(NSCD_SOCKET_OLD);
        pfd[0].events = POLLIN;
        pfd[1].events = POLLIN;
        pfd[0].fd = open_socket(NSCD_SOCKET);
        pfd[1].fd = open_socket(NSCD_SOCKET_OLD);
        pfd[0].events = POLLIN;
        pfd[1].events = POLLIN;
-       
+
        if (debug & D_DAEMON) {
                daemon(/*nochdir*/ 1, /*noclose*/ 0);
                if (config.logfile) {
        if (debug & D_DAEMON) {
                daemon(/*nochdir*/ 1, /*noclose*/ 0);
                if (config.logfile) {
@@ -2311,31 +2552,28 @@ int main(int argc, char **argv)
                signal(SIGTSTP, SIG_IGN);
        }
 
                signal(SIGTSTP, SIG_IGN);
        }
 
+       log(L_ALL, "nscd v" PROGRAM_VERSION ", debug level 0x%x", debug & L_ALL);
+       log(L_DEBUG, "max %u requests in parallel", max_reqnum - 2);
+       log(L_DEBUG, "cache size %u x 8 entries", cache_size);
+
        if (env_U) {
                int size;
                gid_t *ug = env_U_to_uid_and_gids(env_U, &size);
        if (env_U) {
                int size;
                gid_t *ug = env_U_to_uid_and_gids(env_U, &size);
-               if (setgroups(size, &ug[1]))
-                       perror_and_die("cannot set groups for user '%s'", config.user);
-               if (setuid(ug[0]))
-                       perror_and_die("cannot set uid to %u", (unsigned)(ug[0]));
+               if (size > 1)
+                       if (setgroups(size - 1, &ug[1]) || setgid(ug[1]))
+                               perror_and_die("cannot set groups for user '%s'", config.user);
+               if (size > 0)
+                       if (setuid(ug[0]))
+                               perror_and_die("cannot set uid to %u", (unsigned)(ug[0]));
                free(ug);
        }
 
                free(ug);
        }
 
-       log(L_ALL, "nscd v" PROGRAM_VERSION ", debug level %x", debug & L_ALL);
-       log(L_DEBUG, "passwd cache: %d pttl %u nttl %u",
-                               config.srv_enable[SRV_PASSWD],
-                               config.pttl[SRV_PASSWD],
-                               config.nttl[SRV_PASSWD]);
-       log(L_DEBUG, " group cache: %d pttl %u nttl %u",
-                               config.srv_enable[SRV_GROUP ],
-                               config.pttl[SRV_GROUP],
-                               config.nttl[SRV_GROUP]);
-       log(L_DEBUG, " hosts cache: %d pttl %u nttl %u",
-                               config.srv_enable[SRV_HOSTS ],
-                               config.pttl[SRV_HOSTS],
-                               config.nttl[SRV_HOSTS]);
-
        for (n = 0; n < 3; n++) {
        for (n = 0; n < 3; n++) {
+               log(L_DEBUG, "%s cache enabled:%u pttl:%u nttl:%u",
+                               srv_name[n],
+                               config.srv_enable[n],
+                               config.pttl[n],
+                               config.nttl[n]);
                config.pttl[n] *= 1000;
                config.nttl[n] *= 1000;
        }
                config.pttl[n] *= 1000;
                config.nttl[n] *= 1000;
        }