Imported Upstream version 0.49

author Don Armstrong <don@donarmstrong.com>

Mon, 21 Oct 2013 22:03:13 +0000 (15:03 -0700)

committer Don Armstrong <don@donarmstrong.com>

Mon, 21 Oct 2013 22:03:13 +0000 (15:03 -0700)
author Don Armstrong <don@donarmstrong.com>
Mon, 21 Oct 2013 22:03:13 +0000 (15:03 -0700)
committer Don Armstrong <don@donarmstrong.com>
Mon, 21 Oct 2013 22:03:13 +0000 (15:03 -0700)
diff --git a/nscd.c b/nscd.c

new file mode 100644 (file)

index 0000000..23f5851
--- /dev/null
+++ b/nscd.c
@@ -0,0 +1,2610 @@
+/* This file is part of unscd, a complete nscd replacement.
+ * Copyright (C) 2007-2012 Denys Vlasenko. Licensed under the GPL version 2.
+ */
+
+/* unscd is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * unscd is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You can download the GNU General Public License from the GNU website
+ * at http://www.gnu.org/ or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */
+
+/*
+Build instructions:
+
+gcc -Wall -Wunused-parameter -Os -o nscd nscd.c
+
+gcc -fomit-frame-pointer -Wl,--sort-section -Wl,alignment -Wl,--sort-common
+      -Os -o nscd nscd.c
+
+Description:
+
+nscd problems are not exactly unheard of. Over the years, there were
+quite a few bugs in it. This leads people to invent babysitters
+which restart crashed/hung nscd. This is ugly.
+
+After looking at nscd source in glibc I arrived at the conclusion
+that its design is contributing to this significantly. Even if nscd's
+code is 100.00% perfect and bug-free, it can still suffer from bugs
+in libraries it calls.
+
+As designed, it's a multithreaded program which calls NSS libraries.
+These libraries are not part of libc, they may be provided
+by third-party projects (samba, ldap, you name it).
+
+Thus nscd cannot be sure that libraries it calls do not have memory
+or file descriptor leaks and other bugs.
+
+Since nscd is a multithreaded program with a single shared cache,
+any resource leak in any NSS library has a cumulative effect.
+Even if an NSS library leaks a file descriptor 0.01% of the time,
+this will make nscd crash or hang after some time.
+
+Of course bugs in NSS .so modules should be fixed, but meanwhile
+I do want nscd which does not crash or lock up.
+
+So I went ahead and wrote a replacement.
+
+It is a single-threaded server process which offloads all NSS
+lookups to worker children (not threads, but fully independent
+processes). Cache hits are handled by parent. Only cache misses
+start worker children. This design is immune against
+resource leaks and hangs in NSS libraries.
+
+It is also many times smaller.
+
+Currently (v0.36) it emulates glibc nscd pretty closely
+(handles same command line flags and config file), and is moderately tested.
+
+Please note that as of 2008-08 it is not in wide use (yet?).
+If you have trouble compiling it, see an incompatibility with
+the "standard" one or experience hangs/crashes, please report it to
+vda.linux@googlemail.com
+
+***********************************************************************/
+
+/* Make struct ucred appear in sys/socket.h */
+#define _GNU_SOURCE 1
+/* For all good things */
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <time.h>
+#include <netdb.h>
+#include <pwd.h>
+#include <grp.h>
+#include <getopt.h>
+#include <syscall.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/poll.h>
+#include <sys/un.h>
+/* For INT_MAX */
+#include <limits.h>
+/* For inet_ntoa (for debug build only) */
+#include <arpa/inet.h>
+
+/*
+ * 0.21 add SEGV reporting to worker
+ * 0.22 don't do freeaddrinfo() in GETAI worker, it's crashy
+ * 0.23 add parameter parsing
+ * 0.24 add conf file parsing, not using results yet
+ * 0.25 used some of conf file settings (not tested)
+ * 0.26 almost all conf file settings are wired up
+ * 0.27 a bit more of almost all conf file settings are wired up
+ * 0.28 optimized cache aging
+ * 0.29 implemented invalidate and shutdown options
+ * 0.30 fixed buglet (sizeof(ptr) != sizeof(array))
+ * 0.31 reduced client_info by one member
+ * 0.32 fix nttl/size defaults; simpler check for worker child in main()
+ * 0.33 tweak includes so that it builds on my new machine (64-bit userspace);
+ *      do not die on unknown service name, just warn
+ *      ("services" is a new service we don't support)
+ * 0.34 create /var/run/nscd/nscd.pid pidfile like glibc nscd 2.8 does;
+ *      delay setuid'ing itself to server-user after log and pidfile are open
+ * 0.35 readlink /proc/self/exe and use result if execing /proc/self/exe fails
+ * 0.36 exercise extreme paranoia handling server-user option;
+ *      a little bit more verbose logging:
+ *      L_DEBUG2 log level added, use debug-level 7 to get it
+ * 0.37 users reported over-zealous "detected change in /etc/passwd",
+ *      apparently stat() returns random garbage in unused padding
+ *      on some systems. Made the check less paranoid.
+ * 0.38 log POLLHUP better
+ * 0.39 log answers to client better, log getpwnam in the worker,
+ *      pass debug level value down to worker.
+ * 0.40   fix handling of shutdown and invalidate requests;
+ *        fix bug with answer written in several pieces
+ * 0.40.1 set hints.ai_socktype = SOCK_STREAM in GETAI request
+ * 0.41   eliminate double caching of two near-simultaneous identical requests -
+ *        EXPERIMENTAL
+ * 0.42   execute /proc/self/exe by link name first (better comm field)
+ * 0.43   fix off-by-one error in setgroups
+ * 0.44   make -d[ddd] bump up debug - easier to explain to users
+ *        how to produce detailed log (no nscd.conf tweaking)
+ * 0.45   Fix out-of-bounds array access and log/pid file permissions -
+ *        thanks to Sebastian Krahmer (krahmer AT suse.de)
+ * 0.46   fix a case when we forgot to remove a future entry on worker failure
+ * 0.47   fix nscd without -d to not bump debug level
+ * 0.48   fix for changes in __nss_disable_nscd API in glibc-2.15
+ * 0.49   minor tweaks to messages
+ */
+#define PROGRAM_VERSION "0.49"
+
+#define DEBUG_BUILD 1
+
+
+/*
+** Generic helpers
+*/
+
+#define ARRAY_SIZE(x) ((unsigned)(sizeof(x) / sizeof((x)[0])))
+
+#define NORETURN __attribute__ ((__noreturn__))
+
+
+#ifdef MY_CPU_HATES_CHARS
+typedef int smallint;
+#else
+typedef signed char smallint;
+#endif
+
+
+enum {
+       L_INFO   = (1 << 0),
+       L_DEBUG  = ((1 << 1) * DEBUG_BUILD),
+       L_DEBUG2 = ((1 << 2) * DEBUG_BUILD),
+       L_DUMP   = ((1 << 3) * DEBUG_BUILD),
+       L_ALL    = 0xf,
+       D_DAEMON = (1 << 6),
+       D_STAMP  = (1 << 5),
+};
+
+static smallint debug = D_DAEMON;
+
+static void verror(const char *s, va_list p, const char *strerr)
+{
+       char msgbuf[1024];
+       int sz, rem, strerr_len;
+       struct timeval tv;
+
+       sz = 0;
+       if (debug & D_STAMP) {
+               gettimeofday(&tv, NULL);
+               sz = sprintf(msgbuf, "%02u:%02u:%02u.%05u ",
+                       (unsigned)((tv.tv_sec / (60*60)) % 24),
+                       (unsigned)((tv.tv_sec / 60) % 60),
+                       (unsigned)(tv.tv_sec % 60),
+                       (unsigned)(tv.tv_usec / 10));
+       }
+       rem = sizeof(msgbuf) - sz;
+       sz += vsnprintf(msgbuf + sz, rem, s, p);
+       rem = sizeof(msgbuf) - sz; /* can be negative after this! */
+
+       if (strerr) {
+               strerr_len = strlen(strerr);
+               if (rem >= strerr_len + 4) { /* ": STRERR\n\0" */
+                       msgbuf[sz++] = ':';
+                       msgbuf[sz++] = ' ';
+                       strcpy(msgbuf + sz, strerr);
+                       sz += strerr_len;
+               }
+       }
+       if (rem >= 2) {
+               msgbuf[sz++] = '\n';
+               msgbuf[sz] = '\0';
+       }
+       fflush(NULL);
+       fputs(msgbuf, stderr);
+}
+
+static void error(const char *msg, ...)
+{
+       va_list p;
+       va_start(p, msg);
+       verror(msg, p, NULL);
+       va_end(p);
+}
+
+static void error_and_die(const char *msg, ...) NORETURN;
+static void error_and_die(const char *msg, ...)
+{
+       va_list p;
+       va_start(p, msg);
+       verror(msg, p, NULL);
+       va_end(p);
+       _exit(1);
+}
+
+static void perror_and_die(const char *msg, ...) NORETURN;
+static void perror_and_die(const char *msg, ...)
+{
+       va_list p;
+       va_start(p, msg);
+       /* Guard against "<error message>: Success" */
+       verror(msg, p, errno ? strerror(errno) : NULL);
+       va_end(p);
+       _exit(1);
+}
+
+static void nscd_log(int mask, const char *msg, ...)
+{
+       if (debug & mask) {
+               va_list p;
+               va_start(p, msg);
+               verror(msg, p, NULL);
+               va_end(p);
+       }
+}
+
+#define log(lvl, ...) do { if (lvl) nscd_log(lvl, __VA_ARGS__); } while (0)
+
+#if DEBUG_BUILD
+/* Hex + ASCII dump of len bytes to stderr, 16 per row (only with L_DUMP) */
+static void dump(const void *ptr, int len)
+{
+       char text[18];
+       const unsigned char *buf;
+       char *p;
+
+       if (!(debug & L_DUMP))
+               return;
+
+       buf = ptr;
+       while (len > 0) {
+               int chunk = ((len >= 16) ? 16 : len);
+               int i;
+               /* print only bytes that exist: a fixed 16-argument fprintf
+                * would read past the end of a short final row */
+               for (i = 0; i < chunk; i++)
+                       fprintf(stderr, "%02x ", buf[i]);
+               fprintf(stderr, "%*s", (16-chunk) * 3, "");
+               len -= chunk;
+               p = text;
+               do {
+                       unsigned char c = *buf++;
+                       *p++ = (c >= 32 && c < 127 ? c : '.');
+               } while (--chunk);
+               *p++ = '\n';
+               *p = '\0';
+               fputs(text, stderr);
+       }
+}
+#else
+void dump(const void *ptr, int len);
+#endif
+
+#define hex_dump(p,n) do { if (L_DUMP) dump(p,n); } while (0)
+
+static int xopen3(const char *pathname, int flags, int mode)
+{
+       int fd = open(pathname, flags, mode);
+       if (fd < 0)
+               perror_and_die("open");
+       return fd;
+}
+
+static void xpipe(int *fds)
+{
+       if (pipe(fds) < 0)
+               perror_and_die("pipe");
+}
+
+static void xexecve(const char *filename, char **argv, char **envp) NORETURN;
+static void xexecve(const char *filename, char **argv, char **envp)
+{
+       execve(filename, argv, envp);
+       perror_and_die("cannot re-exec %s", filename);
+}
+
+static void ndelay_on(int fd)
+{
+       int fl = fcntl(fd, F_GETFL);
+       if (fl < 0)
+               perror_and_die("F_GETFL");
+       if (fcntl(fd, F_SETFL, fl | O_NONBLOCK) < 0)
+               perror_and_die("setting O_NONBLOCK");
+}
+
+static void close_on_exec(int fd)
+{
+       if (fcntl(fd, F_SETFD, FD_CLOEXEC) < 0)
+               perror_and_die("setting FD_CLOEXEC");
+}
+
+static unsigned monotonic_ms(void)
+{
+       struct timespec ts;
+       if (syscall(__NR_clock_gettime, CLOCK_MONOTONIC, &ts))
+               perror_and_die("clock_gettime(MONOTONIC)");
+       return ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
+}
+
+static unsigned strsize(const char *str)
+{
+       return strlen(str) + 1;
+}
+
+static unsigned strsize_aligned4(const char *str)
+{
+       return (strlen(str) + 1 + 3) & (~3);
+}
+
+static ssize_t safe_read(int fd, void *buf, size_t count)
+{
+       ssize_t n;
+       do {
+               n = read(fd, buf, count);
+       } while (n < 0 && errno == EINTR);
+       return n;
+}
+
+static ssize_t full_read(int fd, void *buf, size_t len)
+{
+       ssize_t cc;
+       ssize_t total;
+       total = 0;
+       while (len) {
+               cc = safe_read(fd, buf, len);
+               if (cc < 0)
+                       return cc;      /* read() returns -1 on failure. */
+               if (cc == 0)
+                       break;
+               buf = ((char *)buf) + cc;
+               total += cc;
+               len -= cc;
+       }
+       return total;
+}
+
+/* unused
+static void xsafe_read(int fd, void *buf, size_t len)
+{
+       if (len != safe_read(fd, buf, len))
+               perror_and_die("short read");
+}
+static void xfull_read(int fd, void *buf, size_t len)
+{
+       if (len != full_read(fd, buf, len))
+               perror_and_die("short read");
+}
+*/
+
+static ssize_t safe_write(int fd, const void *buf, size_t count)
+{
+       ssize_t n;
+       do {
+               n = write(fd, buf, count);
+       } while (n < 0 && errno == EINTR);
+       return n;
+}
+
+static ssize_t full_write(int fd, const void *buf, size_t len)
+{
+       ssize_t cc;
+       ssize_t total;
+
+       total = 0;
+       while (len) {
+               cc = safe_write(fd, buf, len);
+               if (cc < 0)
+                       return cc;      /* write() returns -1 on failure. */
+               total += cc;
+               buf = ((const char *)buf) + cc;
+               len -= cc;
+       }
+       return total;
+}
+
+static void xsafe_write(int fd, const void *buf, size_t count)
+{
+       if (count != safe_write(fd, buf, count))
+               perror_and_die("short write of %ld bytes", (long)count);
+}
+static void xfull_write(int fd, const void *buf, size_t count)
+{
+       if (count != full_write(fd, buf, count))
+               perror_and_die("short write of %ld bytes", (long)count);
+}
+
+static void xmovefd(int from_fd, int to_fd)
+{
+       if (from_fd != to_fd) {
+               if (dup2(from_fd, to_fd) < 0)
+                       perror_and_die("dup2");
+               close(from_fd);
+       }
+}
+
+static unsigned getnum(const char *str) /* parse an unsigned decimal string, or die */
+{
+       if (str[0] >= '0' && str[0] <= '9') {
+               char *p;
+               unsigned long l = strtoul(str, &p, 10);
+               /* must not overflow int even after x1000 */
+               if (!*p && l <= INT_MAX / 1000)
+                       return l;
+       }
+       error_and_die("malformed or too big number '%s'", str);
+}
+
+/* Return s advanced past leading whitespace (stops at NUL: isspace(0) == 0) */
+static char *skip_whitespace(const char *s)
+{
+       while (isspace((unsigned char)*s)) ++s; /* ctype needs uchar range */
+       return (char *) s;
+}
+
+static char *skip_non_whitespace(const char *s) /* -> first whitespace or NUL */
+{
+       while (*s && !isspace((unsigned char)*s)) ++s; /* ctype needs uchar range */
+       return (char *) s;
+}
+
+static void *xmalloc(unsigned sz)
+{
+       void *p = malloc(sz);
+       if (!p)
+               error_and_die("out of memory");
+       return p;
+}
+
+static void *xzalloc(unsigned sz)
+{
+       void *p = xmalloc(sz);
+       memset(p, 0, sz);
+       return p;
+}
+
+static void *xrealloc(void *p, unsigned size)
+{
+       p = realloc(p, size);
+       if (!p)
+               error_and_die("out of memory");
+       return p;
+}
+
+static const char *xstrdup(const char *str)
+{
+       const char *p = strdup(str);
+       if (!p)
+               error_and_die("out of memory");
+       return p;
+}
+
+
+/*
+** Config data
+*/
+
+enum {
+       SRV_PASSWD,
+       SRV_GROUP,
+       SRV_HOSTS,
+};
+
+static const char srv_name[3][7] = {
+       "passwd",
+       "group",
+       "hosts"
+};
+
+static struct {
+       const char *logfile;
+       const char *user;
+       smallint srv_enable[3];
+       smallint check_files[3];
+       unsigned pttl[3];
+       unsigned nttl[3];
+       unsigned size[3];
+} config = {
+       /* We try to closely mimic glibc nscd */
+       .logfile     = NULL, /* default is to not have a log file */
+       .user        = NULL,
+       .srv_enable  = { 0, 0, 0 },
+       .check_files = { 1, 1, 1 },
+       .pttl        = { 3600, 3600, 3600 },
+       .nttl        = { 20, 60, 20 },
+       /* huh, what is the default cache size in glibc nscd? */
+       .size        = { 256 * 8 / 3, 256 * 8 / 3, 256 * 8 / 3 },
+};
+
+static const char default_conffile[] = "/etc/nscd.conf";
+static const char *self_exe_points_to = "/proc/self/exe";
+
+
+/*
+** Clients, workers machinery
+*/
+
+/* Header common to all requests */
+#define USER_REQ_STRUCT \
+       uint32_t version; /* Version number of the daemon interface */ \
+       uint32_t type;    /* Service requested */ \
+       uint32_t key_len; /* Key length */
+
+typedef struct user_req_header {
+       USER_REQ_STRUCT
+} user_req_header;
+
+enum {
+       NSCD_VERSION = 2,
+       MAX_USER_REQ_SIZE = 1024,
+       USER_HDR_SIZE = sizeof(user_req_header),
+       /* DNS queries time out after 20 seconds,
+        * we will allow for a bit more */
+       WORKER_TIMEOUT_SEC = 30,
+       CLIENT_TIMEOUT_MS = 100,
+       SMALL_POLL_TIMEOUT_MS = 200,
+};
+
+typedef struct user_req {
+       union {
+               struct { /* as came from client */
+                       USER_REQ_STRUCT
+               };
+               struct { /* when stored in cache, overlaps .version */
+                       unsigned refcount:8;
+                       /* (timestamp24 * 256) == timestamp in ms */
+                       unsigned timestamp24:24;
+               };
+       };
+       char reqbuf[MAX_USER_REQ_SIZE - USER_HDR_SIZE];
+} user_req;
+
+/* Compile-time check for correct size */
+struct BUG_wrong_user_req_size {
+       char BUG_wrong_user_req_size[sizeof(user_req) == MAX_USER_REQ_SIZE ? 1 : -1];
+};
+
+enum {
+       GETPWBYNAME,
+       GETPWBYUID,
+       GETGRBYNAME,
+       GETGRBYGID,
+       GETHOSTBYNAME,
+       GETHOSTBYNAMEv6,
+       GETHOSTBYADDR,
+       GETHOSTBYADDRv6,
+       SHUTDOWN,               /* Shut the server down */
+       GETSTAT,                /* Get the server statistic */
+       INVALIDATE,             /* Invalidate one special cache */
+       GETFDPW,
+       GETFDGR,
+       GETFDHST,
+       GETAI,
+       INITGROUPS,
+       GETSERVBYNAME,
+       GETSERVBYPORT,
+       GETFDSERV,
+       LASTREQ
+};
+#if DEBUG_BUILD
+static const char *const typestr[] = {
+       "GETPWBYNAME",     /* done */
+       "GETPWBYUID",      /* done */
+       "GETGRBYNAME",     /* done */
+       "GETGRBYGID",      /* done */
+       "GETHOSTBYNAME",   /* done */
+       "GETHOSTBYNAMEv6", /* done */
+       "GETHOSTBYADDR",   /* done */
+       "GETHOSTBYADDRv6", /* done */
+       "SHUTDOWN",        /* done */
+       "GETSTAT",         /* info? */
+       "INVALIDATE",      /* done */
+       /* won't do: nscd passes a name of shmem segment
+        * which client can map and "see" the db */
+       "GETFDPW",
+       "GETFDGR",         /* won't do */
+       "GETFDHST",        /* won't do */
+       "GETAI",           /* done */
+       "INITGROUPS",      /* done */
+       "GETSERVBYNAME",   /* prio 3 (no caching?) */
+       "GETSERVBYPORT",   /* prio 3 (no caching?) */
+       "GETFDSERV"        /* won't do */
+};
+#else
+extern const char *const typestr[];
+#endif
+static const smallint type_to_srv[] = {
+       [GETPWBYNAME     ] = SRV_PASSWD,
+       [GETPWBYUID      ] = SRV_PASSWD,
+       [GETGRBYNAME     ] = SRV_GROUP,
+       [GETGRBYGID      ] = SRV_GROUP,
+       [GETHOSTBYNAME   ] = SRV_HOSTS,
+       [GETHOSTBYNAMEv6 ] = SRV_HOSTS,
+       [GETHOSTBYADDR   ] = SRV_HOSTS,
+       [GETHOSTBYADDRv6 ] = SRV_HOSTS,
+       [GETAI           ] = SRV_HOSTS,
+       [INITGROUPS      ] = SRV_GROUP,
+};
+
+static int unsupported_ureq_type(unsigned type)
+{
+       if (type == GETAI) return 0;
+       if (type == INITGROUPS) return 0;
+       if (type == GETSTAT) return 1;
+       if (type > INVALIDATE) return 1;
+       return 0;
+}
+
+
+typedef struct client_info {
+       /* if client_fd != 0, we are waiting for the reply from worker
+        * on pfd[i].fd, and client_fd is saved client's fd
+        * (we need to put it back into pfd[i].fd later) */
+       int client_fd;
+       unsigned bytecnt;       /* bytes read from client */
+       unsigned bufidx;        /* buffer# in global client_buf[] */
+       unsigned started_ms;
+       unsigned respos;        /* response */
+       user_req *resptr;       /* response */
+       user_req **cache_pp;    /* cache entry address */
+       user_req *ureq;         /* request (points to client_buf[x]) */
+} client_info;
+
+static unsigned g_now_ms;
+static int min_closed = INT_MAX;
+static int cnt_closed = 0;
+static int num_clients = 2; /* two listening sockets are "clients" too */
+
+/* We read up to max_reqnum requests in parallel */
+static unsigned max_reqnum = 14;
+static int next_buf;
+static char          (*client_buf)[MAX_USER_REQ_SIZE];
+static char          *busy_cbuf;
+static struct pollfd *pfd;
+static client_info   *cinfo;
+
+/* Request, response and cache data structures:
+ *
+ * cache[] (defined later):
+ * cacheline_t cache[cache_size] array, or in other words,
+ * user_req* cache[cache_size][8] array.
+ * Every client request is hashed, hash value determines which cache[x]
+ * will have the response stored in one of its 8 elements.
+ * Cache entries have this format: request, then padding to 32 bits,
+ * then the response.
+ * Addresses in cache[x][y] may be NULL or:
+ * (&client_buf[z]) | 1: the cache miss is in progress ("future entry"):
+ * "the data is not in the cache (yet), wait for it to appear"
+ * (&client_buf[z]) | 3: the cache miss is in progress and other clients
+ * also want the same data ("shared future entry")
+ * else (non-NULL but low two bits are 0): cached data in malloc'ed block
+ *
+ * Each of these is a [max_reqnum] sized array:
+ * pfd[i] - given to poll() to wait for requests and replies.
+ *      .fd: first two pfd[i]: listening Unix domain sockets, else
+ *      .fd: open fd to a client, for reading client's request, or
+ *      .fd: open fd to a worker, to send request and get response back
+ * cinfo[i] - auxiliary client data for pfd[i]
+ *      .client_fd: open fd to a client, in case we already had read its
+ *          request and got a cache miss, and created a worker or
+ *          wait for another client's worker.
+ *          Otherwise, it's 0 and client's fd is in pfd[i].fd
+ *      .bufidx: index in client_buf[] we store client's request in
+ *      .bytecnt: size of the request
+ *      .started_ms: used to time out unresponsive clients
+ *      .respos:
+ *      .resptr:
+ *      .cache_pp: &cache[x][y] where the response is, or will be stored.
+ *      .ureq:
+ * When a client has received its reply (or otherwise closed (timeout etc)),
+ * corresponding pfd[i] and cinfo[i] are removed by shifting [i+1], [i+2] etc
+ * elements down, so that both arrays never have free holes.
+ * [num_clients] is always the first free element.
+ *
+ * Each of these also is a [max_reqnum] sized array, but indexes
+ * do not correspond directly to pfd[i] and cinfo[i]:
+ * client_buf[n][MAX_USER_REQ_SIZE] - buffers we read client requests into
+ * busy_cbuf[n] - bool flags marking busy client_buf[]
+ */
+/* Possible reductions:
+ * fd, bufidx - uint8_t
+ * started_ms -> uint16_t started_s
+ * ureq - eliminate (derivable from bufidx?)
+ */
+
+/* Are special bits 0? is it a true cached entry? */
+#define CACHED_ENTRY(p)     ( ((long)(p) & 3) == 0 )
+/* Are special bits 11? is it a shared future cache entry? */
+#define CACHE_SHARED(p)     ( ((long)(p) & 3) == 3 )
+/* Return a ptr with special bits cleared (used for accessing data) */
+#define CACHE_PTR(p)        ( (void*) ((long)(p) & ~(long)3) )
+/* Return a ptr with special bits set to x1: make future cache entry ptr */
+#define MAKE_FUTURE_PTR(p)  ( (void*) ((long)(p) | 1) )
+/* Modify ptr, set special bits to 11: shared future cache entry */
+#define MARK_PTR_SHARED(pp) ( *(long*)(pp) |= 3 )
+
+static inline unsigned ureq_size(const user_req *ureq)
+{
+       return sizeof(user_req_header) + ureq->key_len;
+}
+
+/* Age in ms of a cached entry (0 for tagged "future" pointers). Cast first:
+ * a 24-bit bit-field promotes to int, so a bare << 8 could overflow it. */
+static unsigned cache_age(const user_req *ureq)
+{
+       return CACHED_ENTRY(ureq) ? (uint32_t) (g_now_ms - ((unsigned)ureq->timestamp24 << 8)) : 0;
+}
+
+static void set_cache_timestamp(user_req *ureq)
+{
+       ureq->timestamp24 = g_now_ms >> 8;
+}
+
+static int alloc_buf_no(void)
+{
+       int n = next_buf;
+       do {
+               int cur = next_buf;
+               next_buf = (next_buf + 1) % max_reqnum;
+               if (!busy_cbuf[cur]) {
+                       busy_cbuf[cur] = 1;
+                       return cur;
+               }
+       } while (next_buf != n);
+       error_and_die("no free bufs?!");
+}
+
+static inline void *bufno2buf(int i)
+{
+       return client_buf[i];
+}
+
+static void close_client(unsigned i)
+{
+       log(L_DEBUG, "closing client %u (fd %u,%u)", i, pfd[i].fd, cinfo[i].client_fd);
+       /* Paranoia. We had nasty bugs where client was closed twice. */
+       if (pfd[i].fd == 0) ////
+               return;
+       close(pfd[i].fd);
+       if (cinfo[i].client_fd && cinfo[i].client_fd != pfd[i].fd)
+               close(cinfo[i].client_fd);
+       pfd[i].fd = 0; /* flag as unused (coalescing needs this) */
+       busy_cbuf[cinfo[i].bufidx] = 0;
+       cnt_closed++;
+       if (i < min_closed)
+               min_closed = i;
+}
+
+
+/*
+** nscd API <-> C API conversion
+*/
+
+typedef struct response_header {
+       uint32_t version_or_size;
+       int32_t found;
+       char body[0];
+} response_header;
+
+typedef struct initgr_response_header {
+       uint32_t version_or_size;
+       int32_t found;
+       int32_t ngrps;
+       /* code assumes gid_t == int32, let's check that */
+       int32_t gid[sizeof(gid_t) == sizeof(int32_t) ? 0 : -1];
+       /* char user_str[as_needed]; */
+} initgr_response_header;
+
+static initgr_response_header *obtain_initgroups(const char *username)
+{
+       /* Returns a malloc'ed INITGROUPS reply for username; found=0 if unknown */
+       struct initgr_response_header *resp;
+       struct passwd *pw;
+       unsigned sz;
+       int ngroups;
+
+       pw = getpwnam(username);
+       if (!pw) {
+               /* whole header, zeroed (found = ngrps = 0): 8 bytes would not
+                * cover ngrps, yet the size announced below is sizeof(*resp) */
+               resp = xzalloc(sizeof(*resp));
+               resp->version_or_size = sizeof(*resp);
+               goto ret;
+       }
+
+       /* getgrouplist may be very expensive, it's much better to allocate
+        * a bit more than to run getgrouplist twice */
+       ngroups = 128;
+       resp = NULL;
+       do {
+               sz = sizeof(*resp) + sizeof(resp->gid[0]) * ngroups;
+               resp = xrealloc(resp, sz);
+       } while (getgrouplist(username, pw->pw_gid, (gid_t*) &resp->gid, &ngroups) == -1);
+       log(L_DEBUG, "ngroups=%d", ngroups);
+
+       sz = sizeof(*resp) + sizeof(resp->gid[0]) * ngroups;
+       /* resp = xrealloc(resp, sz); - why bother */
+       resp->version_or_size = sz;
+       resp->found = 1;
+       resp->ngrps = ngroups;
+ ret:
+       return resp;
+}
+
+typedef struct pw_response_header {
+       uint32_t version_or_size;
+       int32_t found;
+       int32_t pw_name_len;
+       int32_t pw_passwd_len;
+       int32_t pw_uid;
+       int32_t pw_gid;
+       int32_t pw_gecos_len;
+       int32_t pw_dir_len;
+       int32_t pw_shell_len;
+       /* char pw_name[pw_name_len]; */
+       /* char pw_passwd[pw_passwd_len]; */
+       /* char pw_gecos[pw_gecos_len]; */
+       /* char pw_dir[pw_dir_len]; */
+       /* char pw_shell[pw_shell_len]; */
+} pw_response_header;
+
+/* Pack pw into one malloc'ed nscd reply: header, then the five strings with
+ * their NULs. pw == NULL yields a header-only reply with found = 0. */
+static pw_response_header *marshal_passwd(struct passwd *pw)
+{
+       char *p;
+       pw_response_header *resp;
+       unsigned pw_name_len, pw_passwd_len, pw_gecos_len;
+       unsigned pw_dir_len, pw_shell_len;
+       unsigned sz = sizeof(*resp);
+       /* %u needs an unsigned argument; the raw difference is ptrdiff_t */
+       if (pw) {
+               sz += (pw_name_len = strsize(pw->pw_name));
+               sz += (pw_passwd_len = strsize(pw->pw_passwd));
+               sz += (pw_gecos_len = strsize(pw->pw_gecos));
+               sz += (pw_dir_len = strsize(pw->pw_dir));
+               sz += (pw_shell_len = strsize(pw->pw_shell));
+       }
+       resp = xzalloc(sz);
+       resp->version_or_size = sz;
+       if (!pw) {
+               /*resp->found = 0;*/
+               goto ret;
+       }
+       resp->found = 1;
+       resp->pw_name_len = pw_name_len;
+       resp->pw_passwd_len = pw_passwd_len;
+       resp->pw_uid = pw->pw_uid;
+       resp->pw_gid = pw->pw_gid;
+       resp->pw_gecos_len = pw_gecos_len;
+       resp->pw_dir_len = pw_dir_len;
+       resp->pw_shell_len = pw_shell_len;
+       p = (char*)(resp + 1);
+       strcpy(p, pw->pw_name); p += pw_name_len;
+       strcpy(p, pw->pw_passwd); p += pw_passwd_len;
+       strcpy(p, pw->pw_gecos); p += pw_gecos_len;
+       strcpy(p, pw->pw_dir); p += pw_dir_len;
+       strcpy(p, pw->pw_shell); p += pw_shell_len;
+       log(L_DEBUG, "sz:%u realsz:%u", sz, (unsigned)(p - (char*)resp));
+ ret:
+       return resp;
+}
+
+typedef struct gr_response_header {
+       uint32_t version_or_size;
+       int32_t found;
+       int32_t gr_name_len;    /* strlen(gr->gr_name) + 1; */
+       int32_t gr_passwd_len;  /* strlen(gr->gr_passwd) + 1; */
+       int32_t gr_gid;         /* gr->gr_gid */
+       int32_t gr_mem_cnt;     /* while (gr->gr_mem[gr_mem_cnt]) ++gr_mem_cnt; */
+       /* int32_t gr_mem_len[gr_mem_cnt]; */
+       /* char gr_name[gr_name_len]; */
+       /* char gr_passwd[gr_passwd_len]; */
+       /* char gr_mem[gr_mem_cnt][gr_mem_len[i]]; */
+       /* char gr_gid_str[as_needed]; - huh? */
+       /* char orig_key[as_needed]; - needed?? I don't do this ATM... */
+/*
+ glibc adds gr_gid_str, but client doesn't get/use it:
+ writev(3, [{"\2\0\0\0\2\0\0\0\5\0\0\0", 12}, {"root\0", 5}], 2) = 17
+ poll([{fd=3, events=POLLIN|POLLERR|POLLHUP, revents=POLLIN}], 1, 5000) = 1
+ read(3, "\2\0\0\0\1\0\0\0\10\0\0\0\4\0\0\0\0\0\0\0\0\0\0\0", 24) = 24
+ readv(3, [{"", 0}, {"root\0\0\0\0\0\0\0\0", 12}], 2) = 12
+ read(3, NULL, 0)        = 0
+*/
+} gr_response_header;
+
+/* Pack gr into one malloc'ed nscd reply (header, member lengths, name,
+ * passwd, members); gr == NULL yields a header-only found = 0 reply. */
+static gr_response_header *marshal_group(struct group *gr)
+{
+       char *p;
+       gr_response_header *resp;
+       unsigned gr_mem_cnt;
+       unsigned sz = sizeof(*resp);
+       if (gr) {
+               sz += strsize(gr->gr_name);
+               sz += strsize(gr->gr_passwd);
+               gr_mem_cnt = 0;
+               while (gr->gr_mem[gr_mem_cnt]) {
+                       sz += strsize(gr->gr_mem[gr_mem_cnt]);
+                       gr_mem_cnt++;
+               }
+               /* for int32_t gr_mem_len[gr_mem_cnt]; */
+               sz += gr_mem_cnt * sizeof(int32_t);
+       }
+       resp = xzalloc(sz);
+       resp->version_or_size = sz;
+       if (!gr) {
+               /*resp->found = 0;*/
+               goto ret;
+       }
+       resp->found = 1;
+       resp->gr_name_len = strsize(gr->gr_name);
+       resp->gr_passwd_len = strsize(gr->gr_passwd);
+       resp->gr_gid = gr->gr_gid;
+       resp->gr_mem_cnt = gr_mem_cnt;
+       p = (char*)(resp + 1);
+/* int32_t gr_mem_len[gr_mem_cnt]; */
+       for (gr_mem_cnt = 0; gr->gr_mem[gr_mem_cnt]; gr_mem_cnt++) {
+               *(uint32_t*)p = strsize(gr->gr_mem[gr_mem_cnt]);
+               p += 4;
+       }
+/* char gr_name[gr_name_len]; */
+       strcpy(p, gr->gr_name);
+       p += strsize(gr->gr_name);
+/* char gr_passwd[gr_passwd_len]; */
+       strcpy(p, gr->gr_passwd);
+       p += strsize(gr->gr_passwd);
+/* char gr_mem[gr_mem_cnt][gr_mem_len[i]]; */
+       gr_mem_cnt = 0;
+       while (gr->gr_mem[gr_mem_cnt]) {
+               strcpy(p, gr->gr_mem[gr_mem_cnt]);
+               p += strsize(gr->gr_mem[gr_mem_cnt]);
+               gr_mem_cnt++;
+       }
+       log(L_DEBUG, "sz:%u realsz:%u", sz, (unsigned)(p - (char*)resp)); /* %u: not ptrdiff_t */
+ ret:
+       return resp;
+}
+
+/* Fixed-size header of the reply built by marshal_hostent().
+ * The variable-length body follows the header in the order given
+ * by the trailing comments inside the struct. */
+typedef struct hst_response_header {
+       uint32_t version_or_size; /* marshal_hostent() stores the total reply size here */
+       int32_t found;          /* 1 if the host was found, otherwise 0 */
+       int32_t h_name_len;     /* size of h_name in the body, including its padding */
+       int32_t h_aliases_cnt;
+       int32_t h_addrtype;     /* AF_INET or AF_INET6 */
+       int32_t h_length;       /* 4 or 16 */
+       int32_t h_addr_list_cnt;
+       int32_t error;          /* HOST_NOT_FOUND when the lookup returned NULL */
+       /* char h_name[h_name_len]; - we pad it to 4 bytes */
+       /* uint32_t h_aliases_len[h_aliases_cnt]; */
+       /* char h_addr_list[h_addr_list_cnt][h_length]; - every one is the same size [h_length] (4 or 16) */
+       /* char h_aliases[h_aliases_cnt][h_aliases_len[i]]; */
+} hst_response_header;
+
+/* Serialize a host lookup result into an nscd reply block.
+ * h: lookup result, or NULL when the host was not found.
+ * Returns a block obtained from xzalloc(); version_or_size holds the
+ * total size in bytes. For h == NULL only the header is allocated and
+ * its error field is set to HOST_NOT_FOUND.
+ * Body layout after the header (same order as written below):
+ *   char h_name[h_name_len];        (size from strsize_aligned4())
+ *   uint32_t h_aliases_len[h_aliases_cnt];
+ *   char h_addr_list[h_addr_list_cnt][h_length];
+ *   char h_aliases[h_aliases_cnt][h_aliases_len[i]];
+ */
+static hst_response_header *marshal_hostent(struct hostent *h)
+{
+       char *p;
+       hst_response_header *resp;
+       unsigned h_name_len = 0;
+       unsigned h_aliases_cnt = 0;
+       unsigned h_addr_list_cnt = 0;
+       unsigned sz = sizeof(*resp);
+       if (h) {
+               /* First pass: compute the total reply size */
+/* char h_name[h_name_len] */
+               sz += h_name_len = strsize_aligned4(h->h_name);
+               while (h->h_addr_list[h_addr_list_cnt]) {
+                       h_addr_list_cnt++;
+               }
+/* char h_addr_list[h_addr_list_cnt][h_length] */
+               sz += h_addr_list_cnt * h->h_length;
+               while (h->h_aliases[h_aliases_cnt]) {
+/* char h_aliases[h_aliases_cnt][h_aliases_len[i]] */
+                       sz += strsize(h->h_aliases[h_aliases_cnt]);
+                       h_aliases_cnt++;
+               }
+/* uint32_t h_aliases_len[h_aliases_cnt] */
+               sz += h_aliases_cnt * 4;
+       }
+       resp = xzalloc(sz);
+       resp->version_or_size = sz;
+       if (!h) {
+               /*resp->found = 0;*/
+               resp->error = HOST_NOT_FOUND;
+               goto ret;
+       }
+       resp->found = 1;
+       resp->h_name_len = h_name_len;
+       resp->h_aliases_cnt = h_aliases_cnt;
+       resp->h_addrtype = h->h_addrtype;
+       resp->h_length = h->h_length;
+       resp->h_addr_list_cnt = h_addr_list_cnt;
+       /*resp->error = 0;*/
+       p = (char*)(resp + 1);
+/* char h_name[h_name_len]; */
+       strcpy(p, h->h_name);
+       p += h_name_len;
+/* uint32_t h_aliases_len[h_aliases_cnt]; */
+       h_aliases_cnt = 0;
+       while (h->h_aliases[h_aliases_cnt]) {
+               *(uint32_t*)p = strsize(h->h_aliases[h_aliases_cnt]);
+               p += 4;
+               h_aliases_cnt++;
+       }
+/* char h_addr_list[h_addr_list_cnt][h_length]; */
+       h_addr_list_cnt = 0;
+       while (h->h_addr_list[h_addr_list_cnt]) {
+               memcpy(p, h->h_addr_list[h_addr_list_cnt], h->h_length);
+               p += h->h_length;
+               h_addr_list_cnt++;
+       }
+/* char h_aliases[h_aliases_cnt][h_aliases_len[i]]; */
+       h_aliases_cnt = 0;
+       while (h->h_aliases[h_aliases_cnt]) {
+               strcpy(p, h->h_aliases[h_aliases_cnt]);
+               p += strsize(h->h_aliases[h_aliases_cnt]);
+               h_aliases_cnt++;
+       }
+       /* Pointer difference is ptrdiff_t; cast so it matches %u */
+       log(L_DEBUG, "sz:%u realsz:%u", sz, (unsigned)(p - (char*)resp));
+ ret:
+       return resp;
+}
+
+/* Reply to addrinfo query.
+ * Fixed-size header built by obtain_addrinfo(); the variable-length
+ * body follows in the order given by the trailing comments. */
+typedef struct ai_response_header {
+       uint32_t version_or_size; /* obtain_addrinfo() stores the total reply size here */
+       int32_t found;          /* 1 if getaddrinfo() succeeded, otherwise 0 */
+       int32_t naddrs;         /* number of addresses in the body */
+       int32_t addrslen;       /* total bytes occupied by all addresses */
+       int32_t canonlen;       /* size of ai_canonname in the body, 0 if absent */
+       int32_t error;          /* getaddrinfo() return value */
+       /* char ai_addr[naddrs][4 or 16]; - addrslen bytes in total */
+       /* char ai_family[naddrs]; - AF_INET[6] each (determines ai_addr[i] length) */
+       /* char ai_canonname[canonlen]; */
+} ai_response_header;
+
+/* Resolve hostname with getaddrinfo() and serialize the result.
+ * hostname: NUL-terminated name to resolve.
+ * Returns a block obtained from xzalloc(); version_or_size holds the
+ * total size in bytes and error holds getaddrinfo()'s return value.
+ * On failure only the header is returned.
+ * Body layout after the header:
+ *   char ai_addr[naddrs][4 or 16];  (addrslen bytes in total)
+ *   char ai_family[naddrs];
+ *   char ai_canonname[canonlen];
+ * The addrinfo list is intentionally not freed (see comment at "ret").
+ */
+static ai_response_header *obtain_addrinfo(const char *hostname)
+{
+       struct addrinfo hints;
+       struct addrinfo *ai;
+       struct addrinfo *ap;
+       ai_response_header *resp;
+       char *p, *family;
+       int err;
+       unsigned sz;
+       unsigned naddrs = 0;
+       unsigned addrslen = 0;
+       unsigned canonlen = 0;
+
+       memset(&hints, 0, sizeof(hints));
+       hints.ai_flags = AI_CANONNAME;
+       /* kills dups (one for each possible SOCK_xxx) */
+       /* this matches glibc behavior */
+       hints.ai_socktype = SOCK_STREAM;
+       ai = NULL; /* on failure getaddrinfo may leave it as-is */
+       err = getaddrinfo(hostname, NULL, &hints, &ai);
+
+       /* First pass: compute the total reply size */
+       sz = sizeof(*resp);
+       if (!err) {
+               if (ai->ai_canonname)
+                       sz += canonlen = strsize(ai->ai_canonname);
+               ap = ai;
+               do {
+                       naddrs++;
+                       addrslen += (ap->ai_family == AF_INET ? 4 : 16);
+                       ap = ap->ai_next;
+               } while (ap);
+               /* one family byte per address, plus the addresses */
+               sz += naddrs + addrslen;
+       }
+       resp = xzalloc(sz);
+       resp->version_or_size = sz;
+       resp->error = err;
+       if (err) {
+               /*resp->found = 0;*/
+               goto ret;
+       }
+       resp->found = 1;
+       resp->naddrs = naddrs;
+       resp->addrslen = addrslen;
+       resp->canonlen = canonlen;
+       p = (char*)(resp + 1);
+       /* family bytes start right after all the addresses */
+       family = p + addrslen;
+       ap = ai;
+       do {
+/* char ai_family[naddrs]; */
+               *family++ = ap->ai_family;
+/* char ai_addr[naddrs][4 or 16]; */
+               if (ap->ai_family == AF_INET) {
+                       memcpy(p, &(((struct sockaddr_in*)(ap->ai_addr))->sin_addr), 4);
+                       p += 4;
+               } else {
+                       memcpy(p, &(((struct sockaddr_in6*)(ap->ai_addr))->sin6_addr), 16);
+                       p += 16;
+               }
+               ap = ap->ai_next;
+       } while (ap);
+/* char ai_canonname[canonlen]; */
+       if (ai->ai_canonname)
+               strcpy(family, ai->ai_canonname);
+       /* Use canonlen (0 when there is no canonical name) rather than
+        * calling strsize() on a possibly-NULL ai_canonname; cast the
+        * ptrdiff_t result so it matches %u */
+       log(L_DEBUG, "sz:%u realsz:%u", sz, (unsigned)(family + canonlen - (char*)resp));
+ ret:
+       /* glibc 2.3.6 segfaults here sometimes
+        * (maybe my mistake, fixed by "ai = NULL;" above).
+        * Since we are in worker and are going to exit anyway, why bother? */
+       /*freeaddrinfo(ai);*/
+       return resp;
+}
+
+
+/*
+** Cache management
+*/
+
+/* one 8-element "cacheline" */
+typedef user_req *cacheline_t[8];
+static unsigned cache_size; /* number of cachelines; lookup_in_cache() reduces the hash modulo this */
+/* Points to cacheline_t  cache[cache_size] array, or in other words,
+ * points to user_req*    cache[cache_size][8] array */
+static cacheline_t *cache;
+static unsigned cached_cnt; /* bumped when lookup_in_cache() takes a free slot, dropped by age_cache() */
+static unsigned cache_access_cnt = 1; /* prevent division by zero */
+static unsigned cache_hit_cnt = 1; /* lookups that matched an existing slot */
+static unsigned last_age_time; /* NOTE(review): not referenced in this part of the file */
+static unsigned aging_interval_ms; /* poll() timeout used by main_loop(); recomputed by age_cache() */
+static unsigned min_aging_interval_ms; /* set in main_loop(): half of the smallest negative TTL, made odd */
+
+/* Locate the answer stored behind the query in a combined
+ * query+answer block: it starts at the query size rounded up
+ * to the next multiple of 4 bytes. */
+static response_header *ureq_response(user_req *ureq)
+{
+       unsigned query_sz = ureq_size(ureq);
+       unsigned answer_ofs = (query_sz + 3) & ~3u;
+
+       return (void*) ((char*)ureq + answer_ofs);
+}
+
+/* This hash is supposed to be good for short textual data.
+ * p: start of the data; sz: its length in bytes; hash: seed value.
+ * Every byte is folded in as hash = (hash * 33) ^ byte.
+ * Returns the final hash; with sz == 0 the seed is returned unchanged
+ * and no byte is read. */
+static uint32_t bernstein_hash(void *p, unsigned sz, uint32_t hash)
+{
+       uint8_t *key = p;
+       /* Pre-tested loop: a do/while would read one byte and then wrap
+        * the counter around when called with sz == 0 */
+       while (sz != 0) {
+               hash = (32 * hash + hash) ^ *key++;
+               sz--;
+       }
+       return hash;
+}
+
+/* Drop one reference to the entry *ureqp and clear *ureqp.
+ * If the entry's refcount is non-zero it is decremented; if it is
+ * already zero the entry is freed. In both cases *ureqp becomes NULL.
+ * Does nothing when *ureqp is NULL or when CACHED_ENTRY() rejects it.
+ */
+static void free_refcounted_ureq(user_req **ureqp)
+{
+       user_req *ureq = *ureqp;
+
+       /* Nothing to release. Tested explicitly because age_cache() pairs
+        * CACHED_ENTRY() with a separate NULL test, so the macro alone
+        * cannot be relied upon to reject NULL before the dereference. */
+       if (ureq == NULL)
+               return;
+
+       if (!CACHED_ENTRY(ureq))
+               return;
+
+       if (ureq->refcount) {
+               ureq->refcount--;
+       } else {
+               log(L_DEBUG2, "refcount == 0, free(%p)", ureq);
+               free(ureq);
+       }
+       *ureqp = NULL;
+}
+
+/* Find ureq in the cache, reserving a slot for it when it is absent.
+ * The request (from key_len onward, seeded with its type) is hashed to
+ * select one 8-slot cacheline, which is then scanned linearly.
+ * Returns:
+ *  - the address of the matching slot, if one compares equal to ureq
+ *    (callers tell finished entries from pending ones via CACHED_ENTRY());
+ *  - the address of a slot now holding MAKE_FUTURE_PTR(ureq), if a free
+ *    slot existed or the entry with the largest cache_age() was evicted;
+ *  - NULL if the line is full and every entry has cache_age() == 0.
+ * Updates cache_access_cnt, and cache_hit_cnt or cached_cnt as applicable.
+ */
+static user_req **lookup_in_cache(user_req *ureq)
+{
+       user_req **cacheline;
+       int free_cache;
+       unsigned hash;
+       unsigned i;
+       unsigned ureq_sz = ureq_size(ureq);
+
+       /* prevent overflow and division by zero */
+       cache_access_cnt++;
+       if ((int)cache_access_cnt < 0) {
+               /* halve both counters together so their ratio is roughly kept */
+               cache_access_cnt = (cache_access_cnt >> 1) + 1;
+               cache_hit_cnt = (cache_hit_cnt >> 1) + 1;
+       }
+
+       hash = bernstein_hash(&ureq->key_len, ureq_sz - offsetof(user_req, key_len), ureq->type);
+       log(L_DEBUG2, "hash:%08x", hash);
+       hash = hash % cache_size;
+       cacheline = cache[hash];
+
+       free_cache = -1; /* index of the first empty slot seen, -1 if none */
+       for (i = 0; i < 8; i++) {
+               user_req *cached = CACHE_PTR(cacheline[i]);
+               if (!cached) {
+                       if (free_cache == -1)
+                               free_cache = i;
+                       continue;
+               }
+               /* ureq->version is always 2 and is reused in cache
+                * for other purposes, we need to skip it here */
+               if (memcmp(&ureq->type, &cached->type, ureq_sz - offsetof(user_req, type)) == 0) {
+                       log(L_DEBUG, "found in cache[%u][%u]", hash, i);
+                       cache_hit_cnt++;
+                       return &cacheline[i];
+               }
+       }
+
+       if (free_cache >= 0) {
+               cached_cnt++;
+               i = free_cache;
+               log(L_DEBUG, "not found, using free cache[%u][%u]", hash, i);
+               goto ret;
+       }
+
+       /* Line is full: pick the slot with the largest cache_age() */
+       unsigned oldest_idx = 0;
+       unsigned oldest_age = 0;
+       for (i = 0; i < 8; i++) {
+               unsigned age = cache_age(cacheline[i]);
+               if (age > oldest_age) {
+                       oldest_age = age;
+                       oldest_idx = i;
+               }
+       }
+       if (oldest_age == 0) {
+               /* All entries in cacheline are "future" entries!
+                * This is very unlikely, but we must still work correctly.
+                * We call this "fake cache entry".
+                * The data will be "cached" only for the duration
+                * of this client's request lifetime.
+                */
+               log(L_DEBUG, "not found, and cache[%u] is full: using fake cache entry", hash);
+               return NULL;
+       }
+       i = oldest_idx;
+       log(L_DEBUG, "not found, freeing and reusing cache[%u][%u] (age %u)", hash, i, oldest_age);
+       /* cached_cnt is left alone here: one entry goes, one comes */
+       free_refcounted_ureq(&cacheline[i]);
+
+ ret:
+       /* Reserve the slot for the answer that is still to arrive */
+       cacheline[i] = MAKE_FUTURE_PTR(ureq);
+       return &cacheline[i];
+}
+
+/* Walk all cache_size * 8 slots and release entries.
+ * free_all: if non-zero, release every selected entry regardless of age;
+ *           otherwise release only entries whose cache_age() has reached
+ *           the positive (found) or negative (not found) TTL configured
+ *           for their service.
+ * srv: service index to restrict the sweep to, or -1 for every service.
+ * aging_interval_ms is reset to INT_MAX when srv == -1 or free_all is set;
+ * it is then lowered to the smallest remaining lifetime of a surviving
+ * entry only in the srv == -1, free_all == 0 sweep.
+ * Slots that fail CACHED_ENTRY() or are NULL are skipped.
+ */
+static void age_cache(unsigned free_all, int srv)
+{
+       user_req **cp = *cache;
+       int i;
+       unsigned sv = cached_cnt; /* count before the sweep, for the log line */
+
+       log(L_DEBUG, "aging cache, srv:%d, free_all:%u", srv, free_all);
+       if (srv == -1 || free_all)
+               aging_interval_ms = INT_MAX;
+       i = cache_size * 8;
+       do {
+               user_req *cached = *cp;
+               if (CACHED_ENTRY(cached) && cached != NULL) {
+                       int csrv = type_to_srv[cached->type];
+                       if (srv == -1 || srv == csrv) {
+                               if (free_all) {
+                                       cached_cnt--;
+                                       free_refcounted_ureq(cp);
+                               } else {
+                                       unsigned age = cache_age(cached);
+                                       response_header *resp = ureq_response(cached);
+                                       unsigned ttl = (resp->found ? config.pttl : config.nttl)[csrv];
+                                       if (age >= ttl) {
+                                               log(L_DEBUG2, "freeing: age %u positive %d ttl %u", age, resp->found, ttl);
+                                               cached_cnt--;
+                                               free_refcounted_ureq(cp);
+                                       } else if (srv == -1) {
+                                               /* remaining lifetime of this entry */
+                                               ttl -= age;
+                                               if (aging_interval_ms > ttl)
+                                                       aging_interval_ms = ttl;
+                                       }
+                               }
+                       }
+               }
+               cp++;
+       } while (--i);
+       log(L_INFO, "aged cache, freed:%u, remain:%u", sv - cached_cnt, cached_cnt);
+       log(L_DEBUG2, "aging interval now %u ms", aging_interval_ms);
+}
+
+
+/*
+** Worker child
+*/
+
+/* Spawns a worker and feeds it with user query on stdin */
+/* Returns stdout fd of the worker, in blocking mode */
+/* The child re-executes this program with argv[0] "worker_nscd" and the
+ * current debug value as argv[1]. Dies via perror_and_die() if vfork()
+ * fails. The returned fd is marked close-on-exec. */
+static int create_and_feed_worker(user_req *ureq)
+{
+       pid_t pid;
+       /* rd/wr must stay adjacent and in this order:
+        * xpipe() is handed &rd and fills the pair */
+       struct {
+               int rd;
+               int wr;
+       } to_child, to_parent;
+
+       /* NB: these pipe fds are in blocking mode and non-CLOEXECed */
+       xpipe(&to_child.rd);
+       xpipe(&to_parent.rd);
+
+       pid = vfork();
+       if (pid < 0) /* error */
+               perror_and_die("vfork");
+       if (!pid) { /* child */
+               char param[sizeof(int)*3 + 2];
+               char *argv[3];
+
+               close(to_child.wr);
+               close(to_parent.rd);
+               xmovefd(to_child.rd, 0);
+               xmovefd(to_parent.wr, 1);
+               sprintf(param, "%u", debug);
+               argv[0] = (char*) "worker_nscd";
+               argv[1] = param;
+               argv[2] = NULL;
+               /* Re-exec ourself, cleaning up all allocated memory.
+                * fds in parent are marked CLOEXEC and will be closed too
+                * (modulo bugs) */
+               /* Try link name first: it's better to have comm field
+                * of "nscd" than "exe" (pgrep reported to fail to find us
+                * by name when comm field contains "exe") */
+               /* argv+2 points at the NULL terminator: empty environment */
+               execve(self_exe_points_to, argv, argv+2);
+               xexecve("/proc/self/exe", argv, argv+2);
+       }
+
+       /* parent */
+       close(to_child.rd);
+       close(to_parent.wr);
+       /* We do not expect child to block for any noticeably long time,
+        * and also we expect write to be one-piece one:
+        * ureq size is <= 1k and pipes are guaranteed to accept
+        * at least PIPE_BUF at once */
+       xsafe_write(to_child.wr, ureq, ureq_size(ureq));
+
+       close(to_child.wr);
+       close_on_exec(to_parent.rd);
+       return to_parent.rd;
+}
+
+static user_req *worker_ureq; /* request being served; set by worker(), read by worker_signal_handler() */
+
+#if DEBUG_BUILD
+/* Debug helper: render a request key for logging.
+ * type: request type; buf: the request's key bytes.
+ * Returns the dotted-quad text from inet_ntoa() for GETHOSTBYADDR,
+ * the literal "IPv6" for GETHOSTBYADDRv6, and buf itself otherwise. */
+static const char *req_str(unsigned type, const char *buf)
+{
+       if (type == GETHOSTBYADDR) {
+               struct in_addr in;
+               /* buf is a plain byte buffer: copy the address out instead
+                * of dereferencing it through a uint32_t pointer */
+               memcpy(&in.s_addr, buf, sizeof(in.s_addr));
+               return inet_ntoa(in);
+       }
+       if (type == GETHOSTBYADDRv6) {
+               return "IPv6";
+       }
+       return buf;
+}
+#else
+/* Declaration only: no definition is provided in non-debug builds */
+const char *req_str(unsigned type, const char *buf);
+#endif
+
+/* Handler installed by worker() for fatal signals.
+ * Logs the signal number together with the request being served
+ * (taken from worker_ureq), then terminates the process with _exit(0).
+ * sig: number of the signal that was delivered. */
+static void worker_signal_handler(int sig)
+{
+#if DEBUG_BUILD
+       log(L_INFO, "worker:%d got sig:%d while handling req "
+               "type:%d(%s) key_len:%d '%s'",
+               getpid(), sig,
+               worker_ureq->type, typestr[worker_ureq->type],
+               worker_ureq->key_len,
+               req_str(worker_ureq->type, worker_ureq->reqbuf)
+       );
+#else
+       log(L_INFO, "worker:%d got sig:%d while handling req "
+               "type:%d key_len:%d",
+               getpid(), sig,
+               worker_ureq->type, worker_ureq->key_len);
+#endif
+       _exit(0);
+}
+
+/* Body of a worker process: serve exactly one request and exit.
+ * param: decimal debug value (create_and_feed_worker() passes it as
+ *        argv[1]); parsed with atoi() into the global "debug".
+ * Reads one user_req from fd 0, performs the lookup selected by its
+ * type, and writes the marshalled reply to fd 1. A "not found" reply
+ * is sent as its first 8 bytes only. Never returns.
+ */
+static void worker(const char *param) NORETURN;
+static void worker(const char *param)
+{
+       user_req ureq;
+       void *resp;
+
+       debug = atoi(param);
+
+       /* The parent sends only the header plus key_len bytes, so the
+        * read below usually fills just part of ureq. Zero it first:
+        * with key_len == 0 the lookup functions would otherwise be
+        * handed an uninitialized reqbuf. */
+       memset(&ureq, 0, sizeof(ureq));
+
+       worker_ureq = &ureq; /* for signal handler */
+
+       /* Make sure we won't hang, but rather die */
+       if (WORKER_TIMEOUT_SEC)
+               alarm(WORKER_TIMEOUT_SEC);
+
+       /* NB: fds 0, 1 are in blocking mode */
+
+       /* We block here (for a short time) */
+       /* Due to ureq size < PIPE_BUF read is atomic */
+       /* No error or size checking: we trust the parent */
+       safe_read(0, &ureq, sizeof(ureq));
+
+       signal(SIGSEGV,   worker_signal_handler);
+       signal(SIGBUS,    worker_signal_handler);
+       signal(SIGILL,    worker_signal_handler);
+       signal(SIGFPE,    worker_signal_handler);
+       signal(SIGABRT,   worker_signal_handler);
+#ifdef SIGSTKFLT
+       signal(SIGSTKFLT, worker_signal_handler);
+#endif
+
+       if (ureq.type == GETHOSTBYNAME
+        || ureq.type == GETHOSTBYNAMEv6
+       ) {
+               resp = marshal_hostent(
+                       ureq.type == GETHOSTBYNAME
+                       ? gethostbyname(ureq.reqbuf)
+                       : gethostbyname2(ureq.reqbuf, AF_INET6)
+               );
+       } else if (ureq.type == GETHOSTBYADDR
+        || ureq.type == GETHOSTBYADDRv6
+       ) {
+               /* here the key is a raw address of key_len bytes */
+               resp = marshal_hostent(gethostbyaddr(ureq.reqbuf, ureq.key_len,
+                       (ureq.type == GETHOSTBYADDR ? AF_INET : AF_INET6)
+               ));
+       } else if (ureq.type == GETPWBYNAME) {
+               struct passwd *pw;
+               log(L_DEBUG2, "getpwnam('%s')", ureq.reqbuf);
+               pw = getpwnam(ureq.reqbuf);
+               log(L_DEBUG2, "getpwnam result:%p", pw);
+               resp = marshal_passwd(pw);
+       } else if (ureq.type == GETPWBYUID) {
+               /* numeric ids arrive as decimal text */
+               resp = marshal_passwd(getpwuid(atoi(ureq.reqbuf)));
+       } else if (ureq.type == GETGRBYNAME) {
+               struct group *gr = getgrnam(ureq.reqbuf);
+               resp = marshal_group(gr);
+       } else if (ureq.type == GETGRBYGID) {
+               struct group *gr = getgrgid(atoi(ureq.reqbuf));
+               resp = marshal_group(gr);
+       } else if (ureq.type == GETAI) {
+               resp = obtain_addrinfo(ureq.reqbuf);
+       } else /*if (ureq.type == INITGROUPS)*/ {
+               resp = obtain_initgroups(ureq.reqbuf);
+       }
+
+       if (!((response_header*)resp)->found) {
+               /* Parent knows about this special case */
+               xfull_write(1, resp, 8);
+       } else {
+               /* Responses can be big (getgrnam("guest") on a big user db),
+                * we cannot rely on them being atomic. full_write loops
+                * if needed */
+               xfull_write(1, resp, ((response_header*)resp)->version_or_size);
+       }
+       _exit(0);
+}
+
+
+/*
+** Main loop
+*/
+
+/* File watched for changes, indexed by service number.
+ * Each element is sized to hold "/etc/passwd", the longest name listed. */
+static const char checked_filenames[][sizeof("/etc/passwd")] = {
+       [SRV_PASSWD] = "/etc/passwd", /*  "/etc/shadow"? */
+       [SRV_GROUP]  = "/etc/group",
+       [SRV_HOSTS]  = "/etc/hosts", /* "/etc/resolv.conf" "/etc/nsswitch.conf"? */
+};
+
+/* Last fingerprint computed by check_files() for each watched file */
+static long checked_status[ARRAY_SIZE(checked_filenames)];
+
+/* Flush cached entries of service srv if its watched file changed.
+ * A fingerprint (mtime ^ size ^ inode) of checked_filenames[srv] is
+ * compared with the one remembered in checked_status[srv]; on mismatch
+ * the new value is stored and age_cache() is asked to free all entries
+ * of that service. */
+static void check_files(int srv)
+{
+       const char *path = checked_filenames[srv];
+       struct stat st;
+       long fingerprint;
+
+       /* Zeroed first: stat() errors are ignored, leaving st all-zero */
+       memset(&st, 0, sizeof(st));
+       stat(path, &st);
+
+       /* Comparing whole struct stat's was giving false positives,
+        * so only the interesting fields take part */
+       fingerprint = (long)st.st_mtime ^ (long)st.st_size ^ (long)st.st_ino;
+       if (fingerprint == checked_status[srv])
+               return; /* unchanged since the previous check */
+
+       checked_status[srv] = fingerprint;
+       log(L_INFO, "detected change in %s", path);
+       age_cache(/*free_all:*/ 1, srv);
+}
+
+/* Process the request received so far from client slot i.
+ * Returns 1 if we immediately have the answer (a cached response was
+ * found and the slot was switched to POLLOUT); returns 0 in every other
+ * case: more bytes are needed, the client was closed, the slot now waits
+ * for another client's pending lookup, or a worker was started. */
+static int handle_client(int i)
+{
+       int srv;
+       user_req *ureq = cinfo[i].ureq;
+       user_req **cache_pp;
+       user_req *ureq_and_resp;
+
+#if DEBUG_BUILD
+       log(L_DEBUG, "version:%d type:%d(%s) key_len:%d '%s'",
+                       ureq->version, ureq->type,
+                       ureq->type < ARRAY_SIZE(typestr) ? typestr[ureq->type] : "?",
+                       ureq->key_len, req_str(ureq->type, ureq->reqbuf));
+#endif
+
+       /* Validate the request before acting on it */
+       if (ureq->version != NSCD_VERSION) {
+               log(L_INFO, "wrong version");
+               close_client(i);
+               return 0;
+       }
+       if (ureq->key_len > sizeof(ureq->reqbuf)) {
+               log(L_INFO, "bogus key_len %u - ignoring", ureq->key_len);
+               close_client(i);
+               return 0;
+       }
+       if (cinfo[i].bytecnt < USER_HDR_SIZE + ureq->key_len) {
+               log(L_INFO, "read %d, need to read %d",
+                       cinfo[i].bytecnt, USER_HDR_SIZE + ureq->key_len);
+               return 0; /* more to read */
+       }
+       if (cinfo[i].bytecnt > USER_HDR_SIZE + ureq->key_len) {
+               log(L_INFO, "read overflow: %u > %u",
+                       (int)cinfo[i].bytecnt, (int)(USER_HDR_SIZE + ureq->key_len));
+               close_client(i);
+               return 0;
+       }
+       if (unsupported_ureq_type(ureq->type)) {
+               /* We don't know this request. Just close the connection.
+                * (glibc client interprets this like "not supported by this nscd")
+                * Happens very often, thus DEBUG, not INFO */
+               log(L_DEBUG, "unsupported query, dropping");
+               close_client(i);
+               return 0;
+       }
+       srv = type_to_srv[ureq->type];
+       if (!config.srv_enable[srv]) {
+               log(L_INFO, "service %d is disabled, dropping", srv);
+               close_client(i);
+               return 0;
+       }
+
+       hex_dump(cinfo[i].ureq, cinfo[i].bytecnt);
+
+       /* Administrative requests: never answered with data */
+       if (ureq->type == SHUTDOWN
+        || ureq->type == INVALIDATE
+       ) {
+#ifdef SO_PEERCRED
+               /* Only honored when the peer's uid is 0 */
+               struct ucred caller;
+               socklen_t optlen = sizeof(caller);
+               if (getsockopt(pfd[i].fd, SOL_SOCKET, SO_PEERCRED, &caller, &optlen) < 0) {
+                       log(L_INFO, "ignoring special request - cannot get caller's id: %s", strerror(errno));
+                       close_client(i);
+                       return 0;
+               }
+               if (caller.uid != 0) {
+                       log(L_INFO, "special request from non-root - ignoring");
+                       close_client(i);
+                       return 0;
+               }
+#endif
+               if (ureq->type == SHUTDOWN) {
+                       log(L_INFO, "got shutdown request, exiting");
+                       exit(0);
+               }
+               /* INVALIDATE must carry a non-empty, NUL-terminated key */
+               if (!ureq->key_len || ureq->reqbuf[ureq->key_len - 1]) {
+                       log(L_INFO, "malformed invalidate request - ignoring");
+                       close_client(i);
+                       return 0;
+               }
+               log(L_INFO, "got invalidate request, flushing cache");
+               /* Frees entire cache. TODO: replace -1 with service (in ureq->reqbuf) */
+               age_cache(/*free_all:*/ 1, -1);
+               close_client(i);
+               return 0;
+       }
+
+       /* Keys other than raw addresses must be NUL-terminated
+        * (a zero-length key passes this check) */
+       if (ureq->type != GETHOSTBYADDR
+        && ureq->type != GETHOSTBYADDRv6
+       ) {
+               if (ureq->key_len && ureq->reqbuf[ureq->key_len - 1] != '\0') {
+                       log(L_INFO, "badly terminated buffer");
+                       close_client(i);
+                       return 0;
+               }
+       }
+
+       if (config.check_files[srv]) {
+               check_files(srv);
+       }
+
+       /* cache_pp == NULL means "fake cache entry": result is not cached */
+       cache_pp = lookup_in_cache(ureq);
+       ureq_and_resp = cache_pp ? *cache_pp : NULL;
+
+       if (ureq_and_resp) {
+               if (CACHED_ENTRY(ureq_and_resp)) {
+                       /* Found. Save ptr to response into cinfo and return */
+                       response_header *resp = ureq_response(ureq_and_resp);
+                       unsigned sz = resp->version_or_size;
+
+                       log(L_DEBUG, "sz:%u", sz);
+                       hex_dump(resp, sz);
+                       ureq_and_resp->refcount++; /* cache shouldn't free it under us! */
+                       pfd[i].events = POLLOUT; /* we want to write out */
+                       cinfo[i].resptr = ureq_and_resp;
+                       /*cinfo[i].respos = 0; - already is */
+                       /* prevent future matches with anything */
+                       cinfo[i].cache_pp = (void *) 1;
+                       return 1; /* "ready to write data out to client" */
+               }
+
+               /* Not found. Remember a pointer where it will appear */
+               cinfo[i].cache_pp = cache_pp;
+
+               /* If it does not point to our own ureq buffer... */
+               if (CACHE_PTR(ureq_and_resp) != ureq) {
+                       /* We are not the first client who wants this */
+                       log(L_DEBUG, "another request is in progress (%p), waiting for its result", ureq_and_resp);
+                       MARK_PTR_SHARED(cache_pp); /* "please inform us when it's ready" */
+                       /* "we do not wait for client anymore" */
+                       cinfo[i].client_fd = pfd[i].fd;
+                       /* Don't wait on fd. Worker response will unblock us */
+                       pfd[i].events = 0;
+                       return 0;
+               }
+               /* else: lookup_in_cache inserted (ureq & 1) into *cache_pp:
+                * we are the first client to miss on this ureq. */
+       }
+
+       /* Start worker thread */
+       log(L_DEBUG, "stored %p in cache, starting a worker", ureq_and_resp);
+       /* Now we will wait on worker's fd, not client's! */
+       cinfo[i].client_fd = pfd[i].fd; /* restored by prepare_for_writeout() */
+       pfd[i].fd = create_and_feed_worker(ureq);
+       return 0;
+}
+
+/* Switch client slot i into "write the response out" mode.
+ * i: index into pfd[] / cinfo[];
+ * cached: query+response block to send, stored as-is into resptr. */
+static void prepare_for_writeout(unsigned i, user_req *cached)
+{
+       log(L_DEBUG2, "client %u: data is ready at %p", i, cached);
+
+       /* A non-zero client_fd is the client's socket saved by
+        * handle_client(); put it back and mark that we no longer
+        * wait for a worker reply */
+       if (cinfo[i].client_fd != 0) {
+               pfd[i].fd = cinfo[i].client_fd;
+               cinfo[i].client_fd = 0;
+       }
+
+       /* The worker may have taken a while (e.g. DNS query):
+        * restart the clock so the client timeout does not fire at once */
+       cinfo[i].started_ms = g_now_ms;
+
+       /* Writeout position: respos is already 0 */
+       cinfo[i].resptr = cached;
+       pfd[i].events = POLLOUT;
+}
+
+/* Worker seems to be ready to write the response.
+ * When we return, response is fully read and stored in cache,
+ * worker's fd is closed, pfd[i] and cinfo[i] are updated.
+ * On a short or malformed reply no block is kept ("cached" is NULL) and
+ * NULL is what gets stored in the cache slot and handed to the waiting
+ * client(s). Always resets aging_interval_ms to min_aging_interval_ms. */
+static void handle_worker_response(int i)
+{
+       struct { /* struct response_header + small body */
+               uint32_t version_or_size;
+               int32_t found;
+               char body[256 - 8];
+       } sz_and_found;
+       user_req *cached;
+       user_req *ureq;
+       response_header *resp;
+       unsigned sz, resp_sz;
+       unsigned ureq_sz_aligned;
+
+       cached = NULL;
+       ureq = cinfo[i].ureq;
+       /* offset of the response inside a combined query+response block */
+       ureq_sz_aligned = (char*)ureq_response(ureq) - (char*)ureq;
+
+       sz = full_read(pfd[i].fd, &sz_and_found, sizeof(sz_and_found));
+       if (sz < 8) {
+               /* worker was killed? */
+               log(L_DEBUG, "worker gave short reply:%u < 8", sz);
+               goto err;
+       }
+
+       resp_sz = sz_and_found.version_or_size;
+       if (resp_sz < sz || resp_sz > 0x0fffffff) { /* 256 mb */
+               error("BUG: bad size from worker:%u", resp_sz);
+               goto err;
+       }
+
+       /* Create new block of cached info */
+       cached = xzalloc(ureq_sz_aligned + resp_sz);
+       log(L_DEBUG2, "xzalloc(%u):%p sz:%u resp_sz:%u found:%u",
+                       ureq_sz_aligned + resp_sz, cached,
+                       sz, resp_sz,
+                       (int)sz_and_found.found
+                       );
+       resp = (void*) (((char*) cached) + ureq_sz_aligned);
+       /* query first, then the part of the response already read */
+       memcpy(cached, ureq, ureq_size(ureq));
+       memcpy(resp, &sz_and_found, sz);
+       if (sz_and_found.found && resp_sz > sz) {
+               /* We need to read data only if it's found
+                * (otherwise worker sends only 8 bytes).
+                *
+                * Replies can be big (getgrnam("guest") on a big user db),
+                * we cannot rely on them being atomic. However, we know
+                * that worker _always_ gives reply in one full_write(),
+                * so we loop and read it all
+                * (looping is implemented inside full_read())
+                */
+               if (full_read(pfd[i].fd, ((char*) resp) + sz, resp_sz - sz) != resp_sz - sz) {
+                       /* worker was killed? */
+                       log(L_DEBUG, "worker gave short reply, free(%p)", cached);
+ err:
+                       /* also reached by goto from the checks above,
+                        * where cached is still NULL */
+                       free(cached);
+                       cached = NULL;
+                       goto wo;
+               }
+       }
+       set_cache_timestamp(cached);
+       hex_dump(resp, resp_sz);
+
+ wo:
+       close(pfd[i].fd);
+
+       /* Save in cache */
+       unsigned ref = 0;
+       user_req **cache_pp = cinfo[i].cache_pp;
+       if (cache_pp != NULL) { /* if not a fake entry */
+               /* fetch the slot's previous (pending) value before overwriting it */
+               ureq = *cache_pp;
+               *cache_pp = cached;
+               if (CACHE_SHARED(ureq)) {
+                       /* Other clients wait for this response too,
+                        * wake them (and us) up and set refcount = no_of_clients */
+                       unsigned j;
+
+                       /* slots 0 and 1 are skipped: main_loop() uses them for accept() */
+                       for (j = 2; j < num_clients; j++) {
+                               if (cinfo[j].cache_pp == cache_pp) {
+                                       /* This client uses the same cache entry */
+                                       ref++;
+                                       /* prevent future matches with anything */
+                                       cinfo[j].cache_pp = (void *) 1;
+                                       prepare_for_writeout(j, cached);
+                               }
+                       }
+                       goto ret;
+               }
+               /* prevent future matches with anything */
+               cinfo[i].cache_pp = (void *) 1;
+               ref = 1;
+       }
+
+       /* fake entry (ref stays 0) or sole waiter (ref == 1) */
+       prepare_for_writeout(i, cached);
+ret:
+       /* cache shouldn't free it under us! */
+       if (cached)
+               cached->refcount = ref;
+       aging_interval_ms = min_aging_interval_ms;
+}
+
+static void main_loop(void)
+{
+       /* 1/2 of smallest negative TTL */
+       min_aging_interval_ms = config.nttl[0];
+       if (min_aging_interval_ms > config.nttl[1]) min_aging_interval_ms = config.nttl[1];
+       if (min_aging_interval_ms > config.nttl[2]) min_aging_interval_ms = config.nttl[2];
+       min_aging_interval_ms = (min_aging_interval_ms / 2) | 1;
+       aging_interval_ms = min_aging_interval_ms;
+
+       while (1) {
+               int i, j;
+               int r;
+
+               r = SMALL_POLL_TIMEOUT_MS;
+               if (num_clients <= 2 && !cached_cnt)
+                       r = -1; /* infinite */
+               else if (num_clients < max_reqnum)
+                       r = aging_interval_ms;
+#if 0 /* Debug: leak detector */
+               {
+                       static unsigned long long cnt;
+                       static unsigned long low_malloc = -1L;
+                       static unsigned long low_sbrk = -1L;
+                       void *p = malloc(540); /* should not be too small */
+                       void *s = sbrk(0);
+                       free(p);
+                       if ((unsigned long)p < low_malloc)
+                               low_malloc = (unsigned long)p;
+                       if ((unsigned long)s < low_sbrk)
+                               low_sbrk = (unsigned long)s;
+                       log(L_INFO, "poll %llu (%d ms). clients:%u cached:%u %u/%u malloc:%p (%lu), sbrk:%p (%lu)",
+                               cnt, r, num_clients, cached_cnt, cache_hit_cnt, cache_access_cnt,
+                               p, (unsigned long)p - low_malloc,
+                               s, (unsigned long)s - low_sbrk);
+                       cnt++;
+               }
+#else
+               log(L_DEBUG, "poll %d ms. clients:%u cached:%u hit ratio:%u/%u",
+                               r, num_clients, cached_cnt, cache_hit_cnt, cache_access_cnt);
+#endif
+
+               r = poll(pfd, num_clients, r);
+               log(L_DEBUG2, "poll returns %d", r);
+               if (r < 0) {
+                       if (errno != EINTR)
+                               perror_and_die("poll");
+                       continue;
+               }
+
+               /* Everything between polls never sleeps.
+                * There is no blocking I/O (except when we talk to worker thread
+                * which is guaranteed to not block us for long) */
+
+               g_now_ms = monotonic_ms();
+               if (r == 0)
+                       goto skip_fd_checks;
+
+               for (i = 0; i < 2; i++) {
+                       int cfd;
+                       if (!pfd[i].revents)
+                               continue;
+                       /* pfd[i].revents = 0; - not needed */
+                       cfd = accept(pfd[i].fd, NULL, NULL);
+                       if (cfd < 0) {
+                               /* odd... poll() says we can accept but accept failed? */
+                               log(L_DEBUG2, "accept failed with %s", strerror(errno));
+                               continue;
+                       }
+                       ndelay_on(cfd);
+                       close_on_exec(cfd);
+                       /* x[num_clients] is next free element, taking it */
+                       log(L_DEBUG2, "new client %d, fd %d", num_clients, cfd);
+                       pfd[num_clients].fd = cfd;
+                       pfd[num_clients].events = POLLIN;
+                       /* this will make us do read() in next for() loop: */
+                       pfd[num_clients].revents = POLLIN;
+                       memset(&cinfo[num_clients], 0, sizeof(cinfo[num_clients]));
+                       /* cinfo[num_clients].bytecnt = 0; - done */
+                       cinfo[num_clients].started_ms = g_now_ms;
+                       cinfo[num_clients].bufidx = alloc_buf_no();
+                       cinfo[num_clients].ureq = bufno2buf(cinfo[num_clients].bufidx);
+                       num_clients++;
+                       if (num_clients >= max_reqnum) {
+                               /* stop accepting new connects for now */
+                               pfd[0].events = pfd[0].revents = 0;
+                               pfd[1].events = pfd[1].revents = 0;
+                       }
+               }
+               for (; i < num_clients; i++) {
+                       if (!pfd[i].revents)
+                               continue;
+                       log(L_DEBUG2, "pfd[%d].revents:0x%x", i, pfd[i].revents);
+                       /* pfd[i].revents = 0; - not needed */
+
+                       /* "Write out result" case */
+                       if (pfd[i].revents == POLLOUT) {
+                               response_header *resp;
+                               uint32_t resp_sz;
+                               if (!cinfo[i].resptr) {
+                                       /* corner case: worker gave bad response earlier */
+                                       close_client(i);
+                                       continue;
+                               }
+ write_out:
+                               resp = ureq_response(cinfo[i].resptr);
+                               resp_sz = resp->version_or_size;
+                               resp->version_or_size = NSCD_VERSION;
+                               errno = 0;
+                               r = safe_write(pfd[i].fd, ((char*) resp) + cinfo[i].respos, resp_sz - cinfo[i].respos);
+                               resp->version_or_size = resp_sz;
+
+                               if (r < 0 && errno == EAGAIN)
+                                       continue;
+                               if (r <= 0) { /* client isn't there anymore */
+                                       log(L_DEBUG, "client %d is gone (write returned:%d err:%s)",
+                                                       i, r, errno ? strerror(errno) : "-");
+ write_out_is_done:
+                                       if (cinfo[i].cache_pp == NULL) {
+                                               log(L_DEBUG, "client %d: freeing fake cache entry %p", i, cinfo[i].resptr);
+                                               free(cinfo[i].resptr);
+                                       } else {
+                                               /* Most of the time, it is not freed here,
+                                                * only refcounted--. Freeing happens
+                                                * if it was deleted from cache[] but retained
+                                                * for writeout. */
+                                               free_refcounted_ureq(&cinfo[i].resptr);
+                                       }
+                                       close_client(i);
+                                       continue;
+                               }
+                               cinfo[i].respos += r;
+                               if (cinfo[i].respos >= resp_sz) {
+                                       /* We wrote everything */
+                                       /* No point in trying to get next request, it won't come.
+                                        * glibc 2.4 client closes its end after each request,
+                                        * without testing for EOF from server. strace:
+                                        * ...
+                                        * read(3, "www.google.com\0\0", 16) = 16
+                                        * close(3) = 0
+                                        */
+                                       log(L_DEBUG, "client %u: sent answer %u bytes", i, cinfo[i].respos);
+                                       goto write_out_is_done;
+                               }
+                       }
+
+                       /* "Read reply from worker" case. Worker may be
+                        * already dead, revents may contain other bits too */
+                       if ((pfd[i].revents & POLLIN) && cinfo[i].client_fd) {
+                               log(L_DEBUG, "reading response for client %u", i);
+                               handle_worker_response(i);
+                               /* We can immediately try to write a response
+                                * to client */
+                               goto write_out;
+                       }
+
+                       /* POLLHUP means pfd[i].fd is closed by peer.
+                        * POLLHUP+POLLOUT is seen when we switch for writeout
+                        * and see that pfd[i].fd is closed by peer. */
+                       if ((pfd[i].revents & ~POLLOUT) == POLLHUP) {
+                               int is_client = (cinfo[i].client_fd == 0 || cinfo[i].client_fd == pfd[i].fd);
+                               log(L_INFO, "%s %u disappeared (got POLLHUP on fd %d)",
+                                       is_client ? "client" : "worker",
+                                       i,
+                                       pfd[i].fd
+                               );
+                               if (is_client)
+                                       close_client(i);
+                               else {
+                                       /* Read worker output anyway, error handling
+                                        * in that function deals with short read.
+                                        * Simply closing client is wrong: it leaks
+                                        * shared future entries. */
+                                       handle_worker_response(i);
+                               }
+                               continue;
+                       }
+
+                       /* All strange and unexpected cases */
+                       if (pfd[i].revents != POLLIN) {
+                               /* Not just "can read", but some other bits are there */
+                               log(L_INFO, "client %u revents is strange:%x", i, pfd[i].revents);
+                               close_client(i);
+                               continue;
+                       }
+
+                       /* "Read request from client" case */
+                       r = safe_read(pfd[i].fd, (char*)(cinfo[i].ureq) + cinfo[i].bytecnt, MAX_USER_REQ_SIZE - cinfo[i].bytecnt);
+                       if (r < 0) {
+                               log(L_DEBUG2, "error reading from client: %s", strerror(errno));
+                               if (errno == EAGAIN)
+                                       continue;
+                               close_client(i);
+                               continue;
+                       }
+                       if (r == 0) {
+                               log(L_INFO, "premature EOF from client, dropping");
+                               close_client(i);
+                               continue;
+                       }
+                       cinfo[i].bytecnt += r;
+                       if (cinfo[i].bytecnt >= sizeof(user_req_header)) {
+                               if (handle_client(i)) {
+                                       /* Response is found in cache! */
+                                       goto write_out;
+                               }
+                       }
+               } /* for each client[2..num_clients-1] */
+
+ skip_fd_checks:
+               /* Age cache */
+               if ((g_now_ms - last_age_time) >= aging_interval_ms) {
+                       last_age_time = g_now_ms;
+                       age_cache(/*free_all:*/ 0, -1);
+               }
+
+               /* Close timed out client connections */
+               for (i = 2; i < num_clients; i++) {
+                       if (pfd[i].fd != 0 /* not closed yet? */ ////
+                        && cinfo[i].client_fd == 0 /* do we still wait for client, not worker? */
+                        && (g_now_ms - cinfo[i].started_ms) > CLIENT_TIMEOUT_MS
+                       ) {
+                               log(L_INFO, "timed out waiting for client %u (%u ms), dropping",
+                                       i, (unsigned)(g_now_ms - cinfo[i].started_ms));
+                               close_client(i);
+                       }
+               }
+
+               if (!cnt_closed)
+                       continue;
+
+               /* We closed at least one client, coalesce pfd[], cinfo[] */
+               if (min_closed + cnt_closed >= num_clients) {
+                       /* clients [min_closed..num_clients-1] are all closed */
+                       /* log(L_DEBUG, "taking shortcut"); - almost always happens */
+                       goto coalesce_done;
+               }
+               j = min_closed;
+               i = min_closed + 1;
+               while (i < num_clients) {
+                       while (1) {
+                               if (pfd[i].fd)
+                                       break;
+                               if (++i >= num_clients)
+                                       goto coalesce_done;
+                       }
+                       pfd[j] = pfd[i];
+                       cinfo[j++] = cinfo[i++];
+               }
+
+ coalesce_done:
+               num_clients -= cnt_closed;
+               log(L_DEBUG, "removing %d closed clients. clients:%d", cnt_closed, num_clients);
+               min_closed = INT_MAX;
+               cnt_closed = 0;
+               /* start accepting new connects */
+               pfd[0].events = POLLIN;
+               pfd[1].events = POLLIN;
+       } /* while (1) */
+}
+
+
+/*
+** Initialization
+*/
+
+#define NSCD_PIDFILE    "/var/run/nscd/nscd.pid" /* pid file: written by write_pid(), unlinked by cleanup_on_signal() */
+#define NSCD_DIR        "/var/run/nscd" /* directory main() tries to create with mode 0755 */
+#define NSCD_SOCKET     "/var/run/nscd/socket" /* listening socket; special_op() also connects to it */
+#define NSCD_SOCKET_OLD "/var/run/.nscd_socket" /* second listening socket (legacy path, judging by the name) */
+
+static smallint wrote_pidfile; /* set to 1 by write_pid(); cleanup_on_signal() unlinks the pid file only if set */
+
+/* Signal handler (main() installs it for SIGINT and SIGTERM): remove the
+ * pid file, if we created one, and both listening sockets, then terminate
+ * with status 0.
+ *
+ * Only async-signal-safe functions may be called from a signal handler.
+ * unlink() and _exit() are; exit() is not, because it runs atexit()
+ * handlers and flushes stdio, which can deadlock or corrupt state when
+ * the signal interrupted stdio or malloc.
+ */
+static void cleanup_on_signal(int sig)
+{
+       (void)sig; /* same action for every signal */
+
+       if (wrote_pidfile)
+               unlink(NSCD_PIDFILE);
+       unlink(NSCD_SOCKET_OLD);
+       unlink(NSCD_SOCKET);
+       _exit(0);
+}
+
+/* Write our pid, followed by a newline, to NSCD_PIDFILE.
+ * If the file cannot be opened nothing happens; otherwise wrote_pidfile
+ * is set to 1 once the file has been written and closed.
+ */
+static void write_pid(void)
+{
+       FILE *fp;
+
+       fp = fopen(NSCD_PIDFILE, "w");
+       if (fp != NULL) {
+               fprintf(fp, "%d\n", getpid());
+               fclose(fp);
+               wrote_pidfile = 1;
+       }
+}
+
+/* Open a listening nscd server socket bound to the filesystem path `name`.
+ *
+ * The socket is passed through ndelay_on() and close_on_exec(), any
+ * existing entry at `name` is unlinked before bind(), and the new socket
+ * file gets mode 0666. The listen backlog is (max_reqnum/8) | 1.
+ * Returns the listening fd; every failure is fatal.
+ */
+static int open_socket(const char *name)
+{
+       struct sockaddr_un addr;
+       int sock;
+
+       /* sun_path is a small fixed-size array: refuse a path which does
+        * not fit instead of letting strcpy() write past its end */
+       if (strlen(name) >= sizeof(addr.sun_path))
+               error_and_die("socket path '%s' is too long", name);
+
+       sock = socket(AF_UNIX, SOCK_STREAM, 0);
+       if (sock < 0)
+               perror_and_die("cannot create unix domain socket");
+       ndelay_on(sock);
+       close_on_exec(sock);
+
+       /* bind() is handed sizeof(addr) bytes: zero all of them so that
+        * no uninitialized stack contents follow the path */
+       memset(&addr, 0, sizeof(addr));
+       addr.sun_family = AF_UNIX;
+       strcpy(addr.sun_path, name);
+       unlink(name);
+       if (bind(sock, (struct sockaddr *) &addr, sizeof(addr)) < 0)
+               perror_and_die("bind(%s)", name);
+       if (chmod(name, 0666) < 0)
+               perror_and_die("chmod(%s)", name);
+       if (listen(sock, (max_reqnum/8) | 1) < 0)
+               perror_and_die("listen");
+       return sock;
+}
+
+static const struct option longopt[] = { /* long options handed to getopt_long() in main() */
+       /* name, has_arg, int *flag, int val
+        *
+        * Order matters: print_help_and_die() prints entries from the top
+        * until it reaches the first one whose val is '?', pairing entry i
+        * with help[i].
+        */
+       { "debug"      , no_argument      , NULL, 'd' },
+       { "config-file", required_argument, NULL, 'f' },
+       { "invalidate" , required_argument, NULL, 'i' },
+       { "shutdown"   , no_argument      , NULL, 'K' },
+       { "nthreads"   , required_argument, NULL, 't' },
+       { "version"    , no_argument      , NULL, 'V' },
+       { "help"       , no_argument      , NULL, '?' }, /* '?' has no case of its own in main(): handled by default: */
+       { "usage"      , no_argument      , NULL, '?' },
+       /* just exit(0). TODO: "test" connect? */
+       { "statistic"  , no_argument      , NULL, 'g' },
+       { "secure"     , no_argument      , NULL, 'S' }, /* ? (main() accepts it and does nothing) */
+       { } /* all-zero terminator entry */
+};
+
+static const char *const help[] = { /* help[i] is the description print_help_and_die() prints for longopt[i] */
+       "Do not daemonize; log to stderr (-dd: more verbosity)",
+       "File to read configuration from",
+       "Invalidate cache",
+       "Shut the server down",
+       "Serve N requests in parallel",
+       "Version", /* last entry: the next longopt[] entry has val '?' and ends the listing */
+};
+
+/* Print the usage banner and one line per documented option to stdout,
+ * then exit with status 1.
+ * longopt[] and help[] are walked in lockstep; the listing stops in front
+ * of the first option whose val is '?'.
+ */
+static void print_help_and_die(void)
+{
+       int idx = 0;
+
+       puts("Usage: nscd [OPTION...]\n"
+            "Name Service Cache Daemon\n");
+       do {
+               printf("\t" "-%c,--%-11s %s\n",
+                               longopt[idx].val, longopt[idx].name, help[idx]);
+               idx++;
+       } while (longopt[idx].val != '?');
+       exit(1);
+}
+
+/* Match `s` against the service names "passwd", "group" and "hosts".
+ *
+ * `s` must be exactly one of these names (strcmp equality). On a match
+ * the service index is stored in *srv and the result is skip_whitespace()
+ * applied to the address just behind the name's terminating NUL, so the
+ * caller must have the rest of the line in the same buffer right after
+ * that NUL. Any other string returns NULL and leaves *srv untouched.
+ */
+static char *skip_service(int *srv, const char *s)
+{
+       size_t step; /* name length plus its NUL */
+
+       if (strcmp(s, "passwd") == 0) {
+               *srv = SRV_PASSWD;
+               step = sizeof("passwd");
+       } else if (strcmp(s, "group") == 0) {
+               *srv = SRV_GROUP;
+               step = sizeof("group");
+       } else if (strcmp(s, "hosts") == 0) {
+               *srv = SRV_HOSTS;
+               step = sizeof("hosts");
+       } else {
+               return NULL;
+       }
+       return skip_whitespace(s + step);
+}
+
+/* Handler for directives which are recognized but deliberately ignored */
+static void handle_null(const char *str, int srv)
+{
+       (void)str;
+       (void)srv;
+}
+
+/* "logfile <path>": store xstrdup(path) in config.logfile */
+static void handle_logfile(const char *str, int srv)
+{
+       (void)srv; /* not a per-service directive */
+       config.logfile = xstrdup(str);
+}
+
+/* "debug-level <N>": OR the low 8 bits of getnum(N) into the debug mask */
+static void handle_debuglvl(const char *str, int srv)
+{
+       uint8_t bits = (uint8_t) getnum(str);
+
+       (void)srv; /* not a per-service directive */
+       debug |= bits;
+}
+
+/* "threads <N>" / "max-threads <N>": raise max_reqnum to N.
+ * The limit is never lowered from here.
+ */
+static void handle_threads(const char *str, int srv)
+{
+       unsigned requested = getnum(str);
+
+       (void)srv; /* not a per-service directive */
+       if (requested > max_reqnum)
+               max_reqnum = requested;
+}
+
+/* "server-user <name>": store xstrdup(name) in config.user */
+static void handle_user(const char *str, int srv)
+{
+       (void)srv; /* not a per-service directive */
+       config.user = xstrdup(str);
+}
+
+/* "enable-cache <service> <yes|no>": the cache for service `srv` is
+ * enabled when the value starts with 'y' or 'Y', disabled otherwise.
+ */
+static void handle_enable(const char *str, int srv)
+{
+       const char first = str[0];
+
+       config.srv_enable[srv] = (first == 'y' || first == 'Y');
+}
+
+/* "positive-time-to-live <service> <N>": store getnum(N) for service
+ * `srv`. main() multiplies the stored value by 1000 before main_loop().
+ */
+static void handle_pttl(const char *str, int srv)
+{
+       config.pttl[srv] = getnum(str);
+}
+
+/* "negative-time-to-live <service> <N>": store getnum(N) for service
+ * `srv`. main() multiplies the stored value by 1000 before main_loop().
+ */
+static void handle_nttl(const char *str, int srv)
+{
+       config.nttl[srv] = getnum(str);
+}
+
+/* "suggested-size <service> <N>": store getnum(N) for service `srv`.
+ * main() derives cache_size from the sum of the three stored sizes.
+ */
+static void handle_size(const char *str, int srv)
+{
+       config.size[srv] = getnum(str);
+}
+
+/* "check-files <service> <yes|no>": config.check_files[srv] is set when
+ * the value starts with 'y' or 'Y', cleared otherwise.
+ */
+static void handle_chfiles(const char *str, int srv)
+{
+       const char first = str[0];
+
+       config.check_files[srv] = (first == 'y' || first == 'Y');
+}
+
+/* Read the configuration file `conffile` and apply its directives.
+ *
+ * A line is "<directive> [<service>] <value>"; '#' starts a comment.
+ * Directives tagged "S" in conf_words[] take a service name (see
+ * skip_service()) in front of the value, those tagged "_" do not.
+ * Unknown directives and unknown service names are skipped; they are
+ * reported through error() only when `warn` is nonzero.
+ * A file which cannot be opened is fatal unless it is default_conffile,
+ * in which case it is silently ignored. A line which does not fit the
+ * line buffer is fatal.
+ */
+static void parse_conffile(const char *conffile, int warn)
+{
+       static const struct confword {
+               const char *str;
+               void (*handler)(const char *, int);
+       } conf_words[] = {
+               { "_" "logfile"               , handle_logfile  },
+               { "_" "debug-level"           , handle_debuglvl },
+               { "_" "threads"               , handle_threads  },
+               { "_" "max-threads"           , handle_threads  },
+               { "_" "server-user"           , handle_user     },
+               /* ignore: any user can stat */
+               { "_" "stat-user"             , handle_null     },
+               { "_" "paranoia"              , handle_null     }, /* ? */
+               /* ignore: design goal is to never crash/hang */
+               { "_" "reload-count"          , handle_null     },
+               { "_" "restart-interval"      , handle_null     },
+               { "S" "enable-cache"          , handle_enable   },
+               { "S" "positive-time-to-live" , handle_pttl     },
+               { "S" "negative-time-to-live" , handle_nttl     },
+               { "S" "suggested-size"        , handle_size     },
+               { "S" "check-files"           , handle_chfiles  },
+               { "S" "persistent"            , handle_null     }, /* ? */
+               { "S" "shared"                , handle_null     }, /* ? */
+               { "S" "auto-propagate"        , handle_null     }, /* ? */
+               { }
+       };
+
+       char buf[128];
+       FILE *file = fopen(conffile, "r");
+       int lineno = 0;
+
+       if (!file) {
+               if (conffile != default_conffile)
+                       perror_and_die("cannot open %s", conffile);
+               return;
+       }
+
+       while (fgets(buf, sizeof(buf), file) != NULL) {
+               const struct confword *word;
+               char *p;
+               int len = strlen(buf);
+               size_t used;
+
+               lineno++;
+               if (len) {
+                       if (buf[len-1] != '\n') {
+                               if (len >= sizeof(buf) - 1)
+                                       error_and_die("%s:%d: line is too long", conffile, lineno);
+                               len++; /* last line, not terminated by '\n' */
+                       }
+                       buf[len-1] = '\0';
+               }
+               p = strchr(buf, '#');
+               if (p)
+                       *p = '\0';
+
+               /* The tokenizer below (and skip_service()) moves to the
+                * next token by stepping one byte over the NUL which ends
+                * the current one. When the current token is the last one
+                * on the line, that byte lies behind the end of the line.
+                * Zero everything behind the line, so that a missing value
+                * is seen as an empty string and not as leftovers of an
+                * earlier, longer line or of the stripped comment.
+                * The line is at most sizeof(buf)-2 chars long here,
+                * therefore "one byte over the NUL" is still inside buf. */
+               used = strlen(buf);
+               memset(buf + used, 0, sizeof(buf) - used);
+
+               p = skip_whitespace(buf);
+               if (!*p)
+                       continue;
+               *skip_non_whitespace(p) = '\0';
+               word = conf_words;
+               while (1) {
+                       /* word->str[0] is the "_"/"S" tag, the name follows it */
+                       if (strcmp(word->str + 1, p) == 0) {
+                               int srv = 0;
+                               p = skip_whitespace(p + strlen(p) + 1);
+                               *skip_non_whitespace(p) = '\0';
+                               if (word->str[0] == 'S') {
+                                       char *p2 = skip_service(&srv, p);
+                                       if (!p2) {
+                                               if (warn)
+                                                       error("%s:%d: ignoring unknown service name '%s'", conffile, lineno, p);
+                                               break;
+                                       }
+                                       p = p2;
+                                       *skip_non_whitespace(p) = '\0';
+                               }
+                               word->handler(p, srv);
+                               break;
+                       }
+                       word++;
+                       if (!word->str) {
+                               if (warn)
+                                       error("%s:%d: ignoring unknown directive '%s'", conffile, lineno, p);
+                               break;
+                       }
+               }
+       }
+       fclose(file);
+}
+
+
+/* "XX,XX[,XX]..." -> gid_t[]
+ *
+ * Parse the comma-separated list of hexadecimal ids in `str`, the format
+ * user_to_env_U() produces (uid first, gids after it).
+ * Returns an xmalloc'ed array which the caller frees, and stores the
+ * number of elements in *sizep. Malformed input is fatal.
+ */
+static gid_t* env_U_to_uid_and_gids(const char *str, int *sizep)
+{
+       const char *sp;
+       gid_t *ug, *gp;
+       int ng;
+
+       /* One array element per comma-separated field */
+       ng = 1;
+       for (sp = str; *sp; sp++)
+               if (*sp == ',')
+                       ng++;
+       ug = xmalloc(ng * sizeof(ug[0]));
+
+       ng = 0;
+       gp = ug;
+       sp = str;
+       while (1) {
+               char *end;
+               unsigned long id;
+
+               errno = 0;
+               id = strtoul(sp, &end, 16);
+               /* end == sp: the field has no digits ("", ",5", "5,,6", "5,").
+                * strtoul() returns 0 for it without setting errno, and id 0
+                * is root: an empty field must never be taken for a valid id. */
+               if (errno || end == sp || (*end != ',' && *end != '\0'))
+                       error_and_die("internal error");
+               *gp++ = id;
+               ng++;
+               if (*end == '\0')
+                       break;
+               sp = end + 1;
+       }
+
+       *sizep = ng;
+       return ug;
+}
+
+
+/* Build the environment string "U=<uid>,<gid>[,<gid>]..." for `user`.
+ *
+ * The user is looked up with getpwnam(), its groups with getgrouplist();
+ * all numbers are written in hexadecimal. Returns an xmalloc'ed string
+ * (main() hands it to putenv()). An unknown user or a failing group
+ * lookup is fatal.
+ */
+static char* user_to_env_U(const char *user)
+{
+       struct passwd *pw;
+       gid_t *ids;
+       char *env_str, *out;
+       int ngroups, nids, i;
+
+       pw = getpwnam(user);
+       if (!pw)
+               perror_and_die("user '%s' is not known", user);
+
+       /* ids[0] is reserved for the uid, the group list starts at ids[1] */
+       ngroups = 64;
+       ids = xmalloc((1 + ngroups) * sizeof(ids[0]));
+       if (getgrouplist(user, pw->pw_gid, &ids[1], &ngroups) < 0) {
+               /* First guess was too small: retry once with the count
+                * getgrouplist() left in ngroups */
+               ids = xrealloc(ids, (1 + ngroups) * sizeof(ids[0]));
+               if (getgrouplist(user, pw->pw_gid, &ids[1], &ngroups) < 0)
+                       perror_and_die("can't get groups of user '%s'", user);
+       }
+       ids[0] = pw->pw_uid;
+       nids = ngroups + 1;
+
+       /* How much do we need for "-Uxx,xx[,xx]..." string?
+        * Per id: up to 2 hex digits per byte of unsigned long, plus ','. */
+       env_str = xmalloc((sizeof(unsigned long)+1)*2 * nids + 3);
+       out = env_str;
+       *out++ = 'U';
+       *out++ = '=';
+       i = 0;
+       do {
+               out += sprintf(out, "%lx,", (unsigned long)ids[i]);
+       } while (++i < nids);
+       out[-1] = '\0'; /* the last ',' becomes the terminator */
+
+       free(ids);
+       return env_str;
+}
+
+
+/* not static - don't inline me, compiler! */
+void readlink_self_exe(void);
+
+/* Read the target of the /proc/self/exe symlink and keep an xstrdup'ed
+ * copy of it in self_exe_points_to. Fatal if readlink() fails.
+ */
+void readlink_self_exe(void)
+{
+       char target[PATH_MAX + 1];
+       ssize_t len;
+
+       len = readlink("/proc/self/exe", target, sizeof(target) - 1);
+       if (len < 0)
+               perror_and_die("readlink %s failed", "/proc/self/exe");
+       target[len] = 0; /* readlink() does not NUL-terminate */
+       self_exe_points_to = xstrdup(target);
+}
+
+
+/* Client mode for -K and -i: connect to NSCD_SOCKET, send one request to
+ * the running daemon and exit(0). Never returns.
+ *
+ * arg == NULL: send a SHUTDOWN request.
+ * arg != NULL: send an INVALIDATE request whose key is `arg` including
+ *              its terminating NUL.
+ * Failure to create the socket or to connect is fatal.
+ */
+static void special_op(const char *arg) NORETURN;
+static void special_op(const char *arg)
+{
+       static const user_req_header ureq = { NSCD_VERSION, SHUTDOWN, 0 };
+
+       struct sockaddr_un addr;
+       int sock;
+
+       sock = socket(PF_UNIX, SOCK_STREAM, 0);
+       if (sock < 0)
+               error_and_die("cannot create AF_UNIX socket");
+
+       addr.sun_family = AF_UNIX;
+       strcpy(addr.sun_path, NSCD_SOCKET);
+       if (connect(sock, (struct sockaddr *) &addr, sizeof(addr)) < 0)
+               error_and_die("cannot connect to %s", NSCD_SOCKET);
+
+       if (!arg) { /* shutdown */
+               xfull_write(sock, &ureq, sizeof(ureq));
+               printf("sent shutdown request, exiting\n");
+       } else { /* invalidate */
+               size_t arg_len = strlen(arg) + 1;
+               user_req_header req;
+               char *msg;
+
+               req.version = NSCD_VERSION;
+               req.type = INVALIDATE;
+               req.key_len = arg_len;
+               /* Header and key go out in a single write, key directly
+                * behind the header. A heap buffer is used because a
+                * variable-length array inside a struct is a GCC-only
+                * extension which other compilers reject. */
+               msg = xmalloc(sizeof(req) + arg_len);
+               memcpy(msg, &req, sizeof(req));
+               memcpy(msg + sizeof(req), arg, arg_len);
+               xfull_write(sock, msg, sizeof(req) + arg_len);
+               free(msg);
+               printf("sent invalidate(%s) request, exiting\n", arg);
+       }
+       exit(0);
+}
+
+
+/* Callback for glibc-2.15: the function handed to __nss_disable_nscd().
+ * Deliberately does nothing with its arguments.
+ */
+struct traced_file;
+static void do_nothing(size_t dbidx, struct traced_file *finfo)
+{
+       (void)dbidx;
+       (void)finfo;
+       /* nscd from glibc-2.15 does something like this:
+       if (!dbs[dbidx].enabled || !dbs[dbidx].check_file)
+               return;
+       add_file_to_watch_list(finfo->fname);
+       */
+}
+
+/* Internal glibc function; calling it disables attempts to contact nscd.
+ * We _are_ nscd: lookups must be done by us and must not recurse.
+ * Up to 2.14 this function had no parameters.
+ * Since 2.15 it takes a function pointer from hell.
+ */
+void __nss_disable_nscd(void (*hell)(size_t, struct traced_file*));
+
+
+int main(int argc, char **argv)
+{
+       int n;
+       unsigned opt_d_cnt;
+       const char *env_U;
+       const char *conffile;
+
+       /* make sure we don't get recursive calls */
+       __nss_disable_nscd(do_nothing);
+
+       if (argv[0][0] == 'w') /* "worker_nscd" */
+               worker(argv[1]);
+
+       setlinebuf(stdout);
+       setlinebuf(stderr);
+
+       /* Make sure stdio is not closed */
+       n = xopen3("/dev/null", O_RDWR, 0);
+       while (n < 2)
+               n = dup(n);
+       /* Close unexpected open file descriptors */
+       n |= 0xff; /* start from at least fd# 255 */
+       do {
+               close(n--);
+       } while (n > 2);
+
+       /* For idiotic kernels which disallow "exec /proc/self/exe" */
+       readlink_self_exe();
+
+       conffile = default_conffile;
+       opt_d_cnt = 0;
+       while ((n = getopt_long(argc, argv, "df:i:KVgt:", longopt, NULL)) != -1) {
+               switch (n) {
+               case 'd':
+                       opt_d_cnt++;
+                       debug &= ~D_DAEMON;
+                       break;
+               case 'f':
+                       conffile = optarg;
+                       break;
+               case 'i':
+                       /* invalidate */
+                       special_op(optarg); /* exits */
+               case 'K':
+                       /* shutdown server */
+                       special_op(NULL); /* exits */
+               case 'V':
+                       puts("unscd - nscd which does not hang, v."PROGRAM_VERSION);
+                       exit(0);
+               case 'g':
+                       exit(0);
+               case 't':
+                       /* N threads */
+                       max_reqnum = getnum(optarg);
+                       break;
+               case 'S':
+                       /* secure (?) */
+                       break;
+               default:
+                       print_help_and_die();
+               }
+       }
+       /* Multiple -d can bump debug regardless of nscd.conf:
+        * no -d or -d: 0, -dd: 1,
+        * -ddd: 3, -dddd: 7, -ddddd: 15
+        */
+       if (opt_d_cnt != 0)
+               debug |= (((1U << opt_d_cnt) >> 1) - 1) & L_ALL;
+
+       env_U = getenv("U");
+       /* Avoid duplicate warnings if $U exists */
+       parse_conffile(conffile, /* warn? */ (env_U == NULL));
+
+       /* I have a user report of (broken?) ldap nss library
+        * opening and never closing a socket to a ldap server,
+        * even across fork() and exec(). This messes up
+        * worker child's operations for the reporter.
+        *
+        * This strenghtens my belief that nscd _must not_ trust
+        * nss libs to be written correctly.
+        *
+        * Here, we need to jump through the hoops to guard against
+        * such problems. If config file has server-user setting, we need
+        * to setgroups + setuid. For that, we need to get uid and gid vector.
+        * And that means possibly using buggy nss libs.
+        * We will do it here, but then we will re-exec, passing uid+gids
+        * in an environment variable.
+        */
+       if (!env_U && config.user) {
+               /* user_to_env_U() does getpwnam and getgrouplist */
+               if (putenv(user_to_env_U(config.user)))
+                       error_and_die("out of memory");
+               /* fds leaked by nss will be closed by execed copy */
+               execv(self_exe_points_to, argv);
+               xexecve("/proc/self/exe", argv, environ);
+       }
+
+       /* Allocate dynamically sized stuff */
+       max_reqnum += 2; /* account for 2 first "fake" clients */
+       if (max_reqnum < 8) max_reqnum = 8; /* sanitize */
+       /* Since refcount is a byte, can't serve more than 255-2 clients
+        * at once. The rest will block in connect() */
+       if (max_reqnum > 0xff) max_reqnum = 0xff;
+       client_buf = xzalloc(max_reqnum * sizeof(client_buf[0]));
+       busy_cbuf  = xzalloc(max_reqnum * sizeof(busy_cbuf[0]));
+       pfd        = xzalloc(max_reqnum * sizeof(pfd[0]));
+       cinfo      = xzalloc(max_reqnum * sizeof(cinfo[0]));
+
+       cache_size = (config.size[0] + config.size[1] + config.size[2]) / 8;
+       if (cache_size < 8) cache_size = 8; /* 8*8 = 64 entries min */
+       if (cache_size > 0xffff) cache_size = 0xffff; /* 8*64k entries max */
+       cache_size |= 1; /* force it to be odd */
+       cache = xzalloc(cache_size * sizeof(cache[0]));
+
+       /* Register cleanup hooks */
+       signal(SIGINT, cleanup_on_signal);
+       signal(SIGTERM, cleanup_on_signal);
+       /* Don't die if a client closes a socket on us */
+       signal(SIGPIPE, SIG_IGN);
+       /* Avoid creating zombies */
+       signal(SIGCHLD, SIG_IGN);
+#if !DEBUG_BUILD
+       /* Ensure workers don't have SIGALRM ignored */
+       signal(SIGALRM, SIG_DFL);
+#endif
+
+       if (mkdir(NSCD_DIR, 0755) == 0) {
+               /* prevent bad mode of NSCD_DIR if umask is e.g. 077 */
+               chmod(NSCD_DIR, 0755);
+       }
+       pfd[0].fd = open_socket(NSCD_SOCKET);
+       pfd[1].fd = open_socket(NSCD_SOCKET_OLD);
+       pfd[0].events = POLLIN;
+       pfd[1].events = POLLIN;
+
+       if (debug & D_DAEMON) {
+               daemon(/*nochdir*/ 1, /*noclose*/ 0);
+               if (config.logfile) {
+                       /* nochdir=1: relative paths still work as expected */
+                       xmovefd(xopen3(config.logfile, O_WRONLY|O_CREAT|O_TRUNC, 0666), 2);
+                       debug |= D_STAMP;
+               } else {
+                       debug = 0; /* why bother? it's /dev/null'ed anyway */
+               }
+               chdir("/"); /* compat */
+               write_pid();
+               setsid();
+               /* ignore job control signals */
+               signal(SIGTTOU, SIG_IGN);
+               signal(SIGTTIN, SIG_IGN);
+               signal(SIGTSTP, SIG_IGN);
+       }
+
+       log(L_ALL, "unscd v" PROGRAM_VERSION ", debug level 0x%x", debug & L_ALL);
+       log(L_DEBUG, "max %u requests in parallel", max_reqnum - 2);
+       log(L_DEBUG, "cache size %u x 8 entries", cache_size);
+
+       if (env_U) {
+               int size;
+               gid_t *ug = env_U_to_uid_and_gids(env_U, &size);
+               if (size > 1)
+                       if (setgroups(size - 1, &ug[1]) || setgid(ug[1]))
+                               perror_and_die("cannot set groups for user '%s'", config.user);
+               if (size > 0)
+                       if (setuid(ug[0]))
+                               perror_and_die("cannot set uid to %u", (unsigned)(ug[0]));
+               free(ug);
+       }
+
+       for (n = 0; n < 3; n++) {
+               log(L_DEBUG, "%s cache enabled:%u pttl:%u nttl:%u",
+                               srv_name[n],
+                               config.srv_enable[n],
+                               config.pttl[n],
+                               config.nttl[n]);
+               config.pttl[n] *= 1000;
+               config.nttl[n] *= 1000;
+       }
+
+       main_loop();
+
+       return 0;
+}
author	Don Armstrong <don@donarmstrong.com>
	Mon, 21 Oct 2013 22:03:13 +0000 (15:03 -0700)
committer	Don Armstrong <don@donarmstrong.com>
	Mon, 21 Oct 2013 22:03:13 +0000 (15:03 -0700)