From 1a1defe361c4f567e9bbc4ef4e3d55caba24f43a Mon Sep 17 00:00:00 2001
From: Don Armstrong <don@donarmstrong.com>
Date: Mon, 21 Oct 2013 15:03:13 -0700
Subject: [PATCH] Imported Upstream version 0.49

---
 nscd.c | 2610 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 2610 insertions(+)
 create mode 100644 nscd.c

diff --git a/nscd.c b/nscd.c
new file mode 100644
index 0000000..23f5851
--- /dev/null
+++ b/nscd.c
@@ -0,0 +1,2610 @@
+/* This file is part of unscd, a complete nscd replacement.
+ * Copyright (C) 2007-2012 Denys Vlasenko. Licensed under the GPL version 2.
+ */
+
+/* unscd is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * unscd is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You can download the GNU General Public License from the GNU website
+ * at http://www.gnu.org/ or write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */
+
+/*
+Build instructions:
+
+gcc -Wall -Wunused-parameter -Os -o nscd nscd.c
+
+gcc -fomit-frame-pointer -Wl,--sort-section -Wl,alignment -Wl,--sort-common
+      -Os -o nscd nscd.c
+
+Description:
+
+nscd problems are not exactly unheard of. Over the years, there have
+been quite a few bugs in it. This leads people to invent babysitters
+which restart a crashed/hung nscd. This is ugly.
+
+After looking at nscd source in glibc I arrived at the conclusion
+that its design is contributing to this significantly. Even if nscd's
+code is 100.00% perfect and bug-free, it can still suffer from bugs
+in libraries it calls.
+
+As designed, it's a multithreaded program which calls NSS libraries.
+These libraries are not part of libc, they may be provided
+by third-party projects (samba, ldap, you name it).
+
+Thus nscd cannot be sure that libraries it calls do not have memory
+or file descriptor leaks and other bugs.
+
+Since nscd is a multithreaded program with a single shared cache,
+any resource leak in any NSS library has a cumulative effect.
+Even if an NSS library leaks a file descriptor 0.01% of the time,
+this will make nscd crash or hang after some time.
+
+Of course bugs in NSS .so modules should be fixed, but meanwhile
+I do want nscd which does not crash or lock up.
+
+So I went ahead and wrote a replacement.
+
+It is a single-threaded server process which offloads all NSS
+lookups to worker children (not threads, but fully independent
+processes). Cache hits are handled by parent. Only cache misses
+start worker children. This design is immune against
+resource leaks and hangs in NSS libraries.
+
+It is also many times smaller.
+
+Currently (v0.36) it emulates glibc nscd pretty closely
+(handles same command line flags and config file), and is moderately tested.
+
+Please note that as of 2008-08 it is not in wide use (yet?).
+If you have trouble compiling it, see an incompatibility with
+"standard" one or experience hangs/crashes, please report it to
+vda.linux@googlemail.com
+
+***********************************************************************/
+
+/* Make struct ucred appear in sys/socket.h */
+#define _GNU_SOURCE 1
+/* For all good things */
+#include <stdio.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <time.h>
+#include <netdb.h>
+#include <pwd.h>
+#include <grp.h>
+#include <getopt.h>
+#include <syscall.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/poll.h>
+#include <sys/un.h>
+/* For INT_MAX */
+#include <limits.h>
+/* For inet_ntoa (for debug build only) */
+#include <arpa/inet.h>
+
+/*
+ * 0.21 add SEGV reporting to worker
+ * 0.22 don't do freeaddrinfo() in GETAI worker, it's crashy
+ * 0.23 add parameter parsing
+ * 0.24 add conf file parsing, not using results yet
+ * 0.25 used some of conf file settings (not tested)
+ * 0.26 almost all conf file settings are wired up
+ * 0.27 a bit more of almost all conf file settings are wired up
+ * 0.28 optimized cache aging
+ * 0.29 implemented invalidate and shutdown options
+ * 0.30 fixed buglet (sizeof(ptr) != sizeof(array))
+ * 0.31 reduced client_info by one member
+ * 0.32 fix nttl/size defaults; simpler check for worker child in main()
+ * 0.33 tweak includes so that it builds on my new machine (64-bit userspace);
+ *      do not die on unknown service name, just warn
+ *      ("services" is a new service we don't support)
+ * 0.34 create /var/run/nscd/nscd.pid pidfile like glibc nscd 2.8 does;
+ *      delay setuid'ing itself to server-user after log and pidfile are open
+ * 0.35 readlink /proc/self/exe and use result if execing /proc/self/exe fails
+ * 0.36 exercise extreme paranoia handling server-user option;
+ *      a little bit more verbose logging:
+ *      L_DEBUG2 log level added, use debug-level 7 to get it
+ * 0.37 users reported over-zealous "detected change in /etc/passwd",
+ *      apparently stat() returns random garbage in unused padding
+ *      on some systems. Made the check less paranoid.
+ * 0.38 log POLLHUP better
+ * 0.39 log answers to client better, log getpwnam in the worker,
+ *      pass debug level value down to worker.
+ * 0.40   fix handling of shutdown and invalidate requests;
+ *        fix bug with answer written in several pieces
+ * 0.40.1 set hints.ai_socktype = SOCK_STREAM in GETAI request
+ * 0.41   eliminate double caching of two near-simultaneous identical requests -
+ *        EXPERIMENTAL
+ * 0.42   execute /proc/self/exe by link name first (better comm field)
+ * 0.43   fix off-by-one error in setgroups
+ * 0.44   make -d[ddd] bump up debug - easier to explain to users
+ *        how to produce detailed log (no nscd.conf tweaking)
+ * 0.45   Fix out-of-bounds array access and log/pid file permissions -
+ *        thanks to Sebastian Krahmer (krahmer AT suse.de)
+ * 0.46   fix a case when we forgot to remove a future entry on worker failure
+ * 0.47   fix nscd without -d to not bump debug level
+ * 0.48   fix for changes in __nss_disable_nscd API in glibc-2.15
+ * 0.49   minor tweaks to messages
+ */
+#define PROGRAM_VERSION "0.49"
+
+#define DEBUG_BUILD 1
+
+
+/*
+** Generic helpers
+*/
+
+#define ARRAY_SIZE(x) ((unsigned)(sizeof(x) / sizeof((x)[0]))) /* x must be an array, not a pointer */
+/* Tells the compiler that a function never returns to its caller */
+#define NORETURN __attribute__ ((__noreturn__))
+
+/* smallint: a small integer; signed char unless MY_CPU_HATES_CHARS is defined */
+#ifdef MY_CPU_HATES_CHARS
+typedef int smallint;
+#else
+typedef signed char smallint;
+#endif
+
+
+enum {
+	L_INFO   = (1 << 0),
+	L_DEBUG  = ((1 << 1) * DEBUG_BUILD), /* L_DEBUG, L_DEBUG2, L_DUMP are 0 if DEBUG_BUILD is 0 */
+	L_DEBUG2 = ((1 << 2) * DEBUG_BUILD),
+	L_DUMP   = ((1 << 3) * DEBUG_BUILD), /* enables dump() output */
+	L_ALL    = 0xf, /* all four L_xxx bits */
+	D_DAEMON = (1 << 6),
+	D_STAMP  = (1 << 5), /* verror() prepends a timestamp */
+};
+/* Bit mask of the L_xxx and D_xxx flags above */
+static smallint debug = D_DAEMON;
+
+/* Log one line to stderr: [timestamp] message [": " strerr] '\n'.
+ * Over-long messages are truncated but always end with a newline. */
+static void verror(const char *s, va_list p, const char *strerr)
+{
+	char msgbuf[1024];
+	int sz, rem, n, strerr_len;
+	struct timeval tv;
+
+	sz = 0;
+	if (debug & D_STAMP) {
+		gettimeofday(&tv, NULL);
+		sz = sprintf(msgbuf, "%02u:%02u:%02u.%05u ",
+			(unsigned)((tv.tv_sec / (60*60)) % 24),
+			(unsigned)((tv.tv_sec / 60) % 60),
+			(unsigned)(tv.tv_sec % 60),
+			(unsigned)(tv.tv_usec / 10));
+	}
+	rem = sizeof(msgbuf) - sz;
+	n = vsnprintf(msgbuf + sz, rem, s, p);
+	if (n < 0) /* formatting failed: keep just the timestamp */
+		n = 0;
+	/* clamp so that "\n\0" still fits after a truncated message */
+	sz = (n > rem - 2) ? (int)sizeof(msgbuf) - 2 : sz + n;
+	rem = sizeof(msgbuf) - sz; /* always >= 2 now */
+	if (strerr) {
+		strerr_len = strlen(strerr);
+		if (rem >= strerr_len + 4) { /* ": STRERR\n\0" */
+			sz += sprintf(msgbuf + sz, ": %s", strerr);
+		}
+	}
+	msgbuf[sz++] = '\n';
+	msgbuf[sz] = '\0';
+	fflush(NULL);
+	fputs(msgbuf, stderr);
+}
+
+static void error(const char *msg, ...)
+{
+	va_list args; /* handed to verror(); NULL means no ": STRERR" suffix */
+	va_start(args, msg);
+	verror(msg, args, NULL);
+	va_end(args);
+}
+
+static void error_and_die(const char *msg, ...) NORETURN;
+static void error_and_die(const char *msg, ...)
+{
+	va_list args; /* report like error(), then _exit(1) - never returns */
+	va_start(args, msg);
+	verror(msg, args, NULL);
+	va_end(args);
+	_exit(1);
+}
+
+static void perror_and_die(const char *msg, ...) NORETURN;
+static void perror_and_die(const char *msg, ...)
+{
+	va_list args;
+	const char *reason = errno ? strerror(errno) : NULL; /* guard against "<error message>: Success" */
+	va_start(args, msg);
+	verror(msg, args, reason);
+	va_end(args);
+	_exit(1);
+}
+
+static void nscd_log(int mask, const char *msg, ...)
+{
+	va_list args;
+	if (!(debug & mask))
+		return; /* none of the requested log bits is enabled */
+	va_start(args, msg);
+	verror(msg, args, NULL);
+	va_end(args);
+}
+/* if (lvl) lets calls with a constant-zero level be optimized away */
+#define log(lvl, ...) do { if (lvl) nscd_log(lvl, __VA_ARGS__); } while (0)
+
+#if DEBUG_BUILD
+/* Hex + ASCII dump of len bytes at ptr to stderr, 16 bytes per line.
+ * Does nothing unless the L_DUMP bit is set in debug. */
+static void dump(const void *ptr, int len)
+{
+	char text[18];
+	const unsigned char *buf = ptr;
+
+	if (!(debug & L_DUMP))
+		return;
+
+	while (len > 0) {
+		int chunk = ((len >= 16) ? 16 : len);
+		int i;
+		/* Print only the bytes we really have: the last line may be
+		 * short, and buf[chunk..15] would be past the end of the data */
+		for (i = 0; i < chunk; i++) {
+			unsigned char c = buf[i];
+			fprintf(stderr, "%02x ", c);
+			text[i] = (c >= 32 && c < 127 ? c : '.');
+		}
+		/* pad the hex column so the text column stays aligned */
+		fprintf(stderr, "%*s", (16-chunk) * 3, "");
+		text[chunk] = '\n';
+		text[chunk + 1] = '\0';
+		fputs(text, stderr);
+		buf += chunk;
+		len -= chunk;
+	}
+}
+#else
+void dump(const void *ptr, int len);
+#endif
+
+/* L_DUMP is a constant 0 when DEBUG_BUILD is 0, so dump() is never called then */
+#define hex_dump(p,n) do { if (L_DUMP) dump(p,n); } while (0)
+
+/* open(2) wrapper: returns the new fd, any failure is reported and fatal. */
+static int xopen3(const char *pathname, int flags, int mode)
+{
+	int opened = open(pathname, flags, mode);
+	if (opened >= 0) return opened;
+	perror_and_die("open");
+}
+
+/* pipe(2) wrapper: failure to create the pipe is fatal. */
+static void xpipe(int *fds)
+{
+	if (pipe(fds) < 0) perror_and_die("pipe");
+}
+
+static void xexecve(const char *filename, char **argv, char **envp) NORETURN;
+static void xexecve(const char *filename, char **argv, char **envp)
+{
+	execve(filename, argv, envp); /* returns only if the exec failed */
+	perror_and_die("cannot re-exec %s", filename);
+}
+
+/* Switch fd to non-blocking mode, keeping its other status flags. */
+static void ndelay_on(int fd)
+{
+	int status_flags = fcntl(fd, F_GETFL);
+	if (status_flags < 0) perror_and_die("F_GETFL");
+	status_flags |= O_NONBLOCK;
+	if (fcntl(fd, F_SETFL, status_flags) < 0) perror_and_die("setting O_NONBLOCK");
+}
+
+/* Set fd's descriptor flags to FD_CLOEXEC; failure is fatal. */
+static void close_on_exec(int fd)
+{
+	if (fcntl(fd, F_SETFD, FD_CLOEXEC) < 0) perror_and_die("setting FD_CLOEXEC");
+}
+
+static unsigned monotonic_ms(void)
+{ /* CLOCK_MONOTONIC in ms; unsigned math wraps (every ~49.7 days) instead of overflowing */
+	struct timespec ts;
+	if (syscall(__NR_clock_gettime, CLOCK_MONOTONIC, &ts))
+		perror_and_die("clock_gettime(MONOTONIC)");
+	return (unsigned)ts.tv_sec * 1000u + (unsigned)(ts.tv_nsec / 1000000);
+}
+
+static unsigned strsize(const char *str)
+{
+	return strlen(str) + 1; /* bytes needed to store str including its NUL */
+}
+
+static unsigned strsize_aligned4(const char *str)
+{
+	return (strlen(str) + 1 + 3) & (~3); /* size including NUL, rounded up to a multiple of 4 */
+}
+
+/* read(2) which is transparently restarted after EINTR. */
+static ssize_t safe_read(int fd, void *buf, size_t count)
+{
+	for (;;) {
+		ssize_t got = read(fd, buf, count);
+		if (got >= 0 || errno != EINTR) return got;
+	}
+}
+
+/* Read until len bytes have arrived or EOF. Returns the byte count, or
+ * a negative value if a read fails (bytes already read are then lost). */
+static ssize_t full_read(int fd, void *buf, size_t len)
+{
+	char *dst = buf;
+	ssize_t total = 0;
+	while (len != 0) {
+		ssize_t cc = safe_read(fd, dst, len);
+		if (cc < 0)
+			return cc;
+		if (cc == 0)
+			break; /* EOF */
+		dst += cc;
+		total += cc;
+		len -= cc;
+	}
+	return total;
+}
+
+/* unused
+static void xsafe_read(int fd, void *buf, size_t len)
+{
+	if (len != safe_read(fd, buf, len))
+		perror_and_die("short read");
+}
+static void xfull_read(int fd, void *buf, size_t len)
+{
+	if (len != full_read(fd, buf, len))
+		perror_and_die("short read");
+}
+*/
+
+/* write(2) which is transparently restarted after EINTR. */
+static ssize_t safe_write(int fd, const void *buf, size_t count)
+{
+	for (;;) {
+		ssize_t put = write(fd, buf, count);
+		if (put >= 0 || errno != EINTR) return put;
+	}
+}
+
+/* Write all len bytes, retrying after partial writes. Returns len, or
+ * a negative value as soon as a write fails. */
+static ssize_t full_write(int fd, const void *buf, size_t len)
+{
+	const char *src = buf;
+	ssize_t total = 0;
+	while (len != 0) {
+		ssize_t cc = safe_write(fd, src, len);
+		if (cc < 0)
+			return cc;
+		src += cc;
+		total += cc;
+		len -= cc;
+	}
+	return total;
+}
+
+static void xsafe_write(int fd, const void *buf, size_t count)
+{
+	if (count != safe_write(fd, buf, count)) /* error (-1) or partial write: fatal */
+		perror_and_die("short write of %ld bytes", (long)count);
+}
+static void xfull_write(int fd, const void *buf, size_t count)
+{
+	if (count != full_write(fd, buf, count)) /* full_write() failed before all bytes were out */
+		perror_and_die("short write of %ld bytes", (long)count);
+}
+
+/* Make to_fd a duplicate of from_fd, then close from_fd. */
+static void xmovefd(int from_fd, int to_fd)
+{
+	if (from_fd == to_fd) return; /* already in place */
+	if (dup2(from_fd, to_fd) < 0)
+		perror_and_die("dup2");
+	close(from_fd);
+}
+
+/* Parse a non-negative decimal number; malformed or too big input is fatal. */
+static unsigned getnum(const char *str)
+{
+	if (str[0] >= '0' && str[0] <= '9') {
+		char *end;
+		unsigned long val = strtoul(str, &end, 10);
+		/* must not overflow int even after x1000 */
+		if (!*end && val <= INT_MAX / 1000) return val;
+	}
+	error_and_die("malformed or too big number '%s'", str);
+}
+
+static char *skip_whitespace(const char *s)
+{
+	/* NB: isspace('\0') is 0. Cast: a negative char is not a valid isspace() argument */
+	while (isspace((unsigned char)*s)) ++s;
+	return (char *) s; /* first non-whitespace char, or the final NUL */
+}
+
+static char *skip_non_whitespace(const char *s)
+{
+	while (*s && !isspace((unsigned char)*s)) ++s; /* cast: negative char is invalid for isspace() */
+	return (char *) s; /* first whitespace char, or the final NUL */
+}
+
+/* malloc() wrapper: a NULL result is fatal, so callers never see it. */
+static void *xmalloc(unsigned sz)
+{
+	void *mem = malloc(sz);
+	if (mem == NULL) error_and_die("out of memory");
+	return mem;
+}
+
+/* xmalloc() followed by a zero fill of the whole block. */
+static void *xzalloc(unsigned sz)
+{
+	/* memset() returns its destination pointer */
+	return memset(xmalloc(sz), 0, sz);
+}
+
+/* realloc() wrapper: a NULL result is fatal, so callers never see it. */
+static void *xrealloc(void *p, unsigned size)
+{
+	void *grown = realloc(p, size);
+	if (grown == NULL) error_and_die("out of memory");
+	return grown;
+}
+
+/* strdup() wrapper: a NULL result is fatal, so callers never see it. */
+static const char *xstrdup(const char *str)
+{
+	const char *copy = strdup(str);
+	if (copy == NULL) error_and_die("out of memory");
+	return copy;
+}
+
+
+/*
+** Config data
+*/
+
+enum {
+	SRV_PASSWD,
+	SRV_GROUP,
+	SRV_HOSTS,
+};
+/* Service names in SRV_xxx order; 7 bytes hold "passwd" plus its NUL */
+static const char srv_name[3][7] = {
+	"passwd",
+	"group",
+	"hosts"
+};
+/* Run-time settings; NOTE(review): the [3] arrays look indexed by SRV_xxx - confirm */
+static struct {
+	const char *logfile;
+	const char *user;
+	smallint srv_enable[3];
+	smallint check_files[3];
+	unsigned pttl[3];
+	unsigned nttl[3];
+	unsigned size[3];
+} config = {
+	/* We try to closely mimic glibc nscd */
+	.logfile     = NULL, /* default is to not have a log file */
+	.user        = NULL,
+	.srv_enable  = { 0, 0, 0 },
+	.check_files = { 1, 1, 1 },
+	.pttl        = { 3600, 3600, 3600 },
+	.nttl        = { 20, 60, 20 },
+	/* huh, what is the default cache size in glibc nscd? */
+	.size        = { 256 * 8 / 3, 256 * 8 / 3, 256 * 8 / 3 },
+};
+
+static const char default_conffile[] = "/etc/nscd.conf";
+static const char *self_exe_points_to = "/proc/self/exe";
+
+
+/*
+** Clients, workers machinery
+*/
+
+/* Header common to all requests */
+#define USER_REQ_STRUCT \
+	uint32_t version; /* Version number of the daemon interface */ \
+	uint32_t type;    /* Service requested */ \
+	uint32_t key_len; /* Key length */
+/* The bare request header: three 32-bit fields */
+typedef struct user_req_header {
+	USER_REQ_STRUCT
+} user_req_header;
+/* Protocol constants and timeouts */
+enum {
+	NSCD_VERSION = 2,
+	MAX_USER_REQ_SIZE = 1024, /* whole request: header plus key data */
+	USER_HDR_SIZE = sizeof(user_req_header),
+	/* DNS queries time out after 20 seconds,
+	 * we will allow for a bit more */
+	WORKER_TIMEOUT_SEC = 30,
+	CLIENT_TIMEOUT_MS = 100,
+	SMALL_POLL_TIMEOUT_MS = 200,
+};
+/* Request buffer; its first 32-bit word has two interpretations */
+typedef struct user_req {
+	union {
+		struct { /* as came from client */
+			USER_REQ_STRUCT
+		};
+		struct { /* when stored in cache, overlaps .version */
+			unsigned refcount:8;
+			/* (timestamp24 * 256) == timestamp in ms */
+			unsigned timestamp24:24;
+		};
+	};
+	char reqbuf[MAX_USER_REQ_SIZE - USER_HDR_SIZE]; /* key data follows the header */
+} user_req;
+
+/* Compile-time check for correct size */
+struct BUG_wrong_user_req_size {
+	char BUG_wrong_user_req_size[sizeof(user_req) == MAX_USER_REQ_SIZE ? 1 : -1]; /* negative array size = build error */
+};
+
+enum { /* Request types (the values unsupported_ureq_type() compares against) */
+	GETPWBYNAME,
+	GETPWBYUID,
+	GETGRBYNAME,
+	GETGRBYGID,
+	GETHOSTBYNAME,
+	GETHOSTBYNAMEv6,
+	GETHOSTBYADDR,
+	GETHOSTBYADDRv6,
+	SHUTDOWN,               /* Shut the server down */
+	GETSTAT,                /* Get the server statistic */
+	INVALIDATE,             /* Invalidate one special cache */
+	GETFDPW,
+	GETFDGR,
+	GETFDHST,
+	GETAI,
+	INITGROUPS,
+	GETSERVBYNAME,
+	GETSERVBYPORT,
+	GETFDSERV,
+	LASTREQ /* one past the last request type */
+};
+#if DEBUG_BUILD
+static const char *const typestr[] = { /* request type names, in enum order */
+	"GETPWBYNAME",     /* done */
+	"GETPWBYUID",      /* done */
+	"GETGRBYNAME",     /* done */
+	"GETGRBYGID",      /* done */
+	"GETHOSTBYNAME",   /* done */
+	"GETHOSTBYNAMEv6", /* done */
+	"GETHOSTBYADDR",   /* done */
+	"GETHOSTBYADDRv6", /* done */
+	"SHUTDOWN",        /* done */
+	"GETSTAT",         /* info? */
+	"INVALIDATE",      /* done */
+	/* won't do: nscd passes a name of shmem segment
+	 * which client can map and "see" the db */
+	"GETFDPW",
+	"GETFDGR",         /* won't do */
+	"GETFDHST",        /* won't do */
+	"GETAI",           /* done */
+	"INITGROUPS",      /* done */
+	"GETSERVBYNAME",   /* prio 3 (no caching?) */
+	"GETSERVBYPORT",   /* prio 3 (no caching?) */
+	"GETFDSERV"        /* won't do */
+};
+#else
+extern const char *const typestr[];
+#endif
+static const smallint type_to_srv[] = { /* request type -> SRV_xxx; unlisted types up to INITGROUPS read as 0 (SRV_PASSWD) */
+	[GETPWBYNAME     ] = SRV_PASSWD,
+	[GETPWBYUID      ] = SRV_PASSWD,
+	[GETGRBYNAME     ] = SRV_GROUP,
+	[GETGRBYGID      ] = SRV_GROUP,
+	[GETHOSTBYNAME   ] = SRV_HOSTS,
+	[GETHOSTBYNAMEv6 ] = SRV_HOSTS,
+	[GETHOSTBYADDR   ] = SRV_HOSTS,
+	[GETHOSTBYADDRv6 ] = SRV_HOSTS,
+	[GETAI           ] = SRV_HOSTS,
+	[INITGROUPS      ] = SRV_GROUP, /* highest index: the array has INITGROUPS + 1 elements */
+};
+
+/* Returns 1 if the given request type is not supported here, else 0. */
+static int unsupported_ureq_type(unsigned type)
+{
+	if (type == GETAI || type == INITGROUPS)
+		return 0;
+	/* GETSTAT, and everything numbered above INVALIDATE, is unsupported */
+	return (type == GETSTAT || type > INVALIDATE);
+}
+
+
+typedef struct client_info {
+	/* if client_fd != 0, we are waiting for the reply from worker
+	 * on pfd[i].fd, and client_fd is saved client's fd
+	 * (we need to put it back into pfd[i].fd later) */
+	int client_fd;
+	unsigned bytecnt;       /* bytes read from client */
+	unsigned bufidx;        /* buffer# in global client_buf[] */
+	unsigned started_ms;
+	unsigned respos;        /* response */
+	user_req *resptr;       /* response */
+	user_req **cache_pp;    /* cache entry address */
+	user_req *ureq;         /* request (points to client_buf[x]) */
+} client_info;
+
+static unsigned g_now_ms; /* current time in ms; cache timestamps are derived from it */
+static int min_closed = INT_MAX; /* lowest slot index closed by close_client() */
+static int cnt_closed = 0; /* incremented by every effective close_client() */
+static int num_clients = 2; /* two listening sockets are "clients" too */
+
+/* We read up to max_reqnum requests in parallel */
+static unsigned max_reqnum = 14;
+static int next_buf; /* where alloc_buf_no() starts looking for a free buffer */
+static char          (*client_buf)[MAX_USER_REQ_SIZE];
+static char          *busy_cbuf; /* busy_cbuf[n] != 0: client_buf[n] is in use */
+static struct pollfd *pfd;
+static client_info   *cinfo;
+
+/* Request, response and cache data structures:
+ *
+ * cache[] (defined later):
+ * cacheline_t cache[cache_size] array, or in other words,
+ * user_req* cache[cache_size][8] array.
+ * Every client request is hashed, hash value determines which cache[x]
+ * will have the response stored in one of its 8 elements.
+ * Cache entries have this format: request, then padding to 32 bits,
+ * then the response.
+ * Addresses in cache[x][y] may be NULL or:
+ * (&client_buf[z]) | 1: the cache miss is in progress ("future entry"):
+ * "the data is not in the cache (yet), wait for it to appear"
+ * (&client_buf[z]) | 3: the cache miss is in progress and other clients
+ * also want the same data ("shared future entry")
+ * else (non-NULL but low two bits are 0): cached data in malloc'ed block
+ *
+ * Each of these is a [max_reqnum] sized array:
+ * pfd[i] - given to poll() to wait for requests and replies.
+ *      .fd: first two pfd[i]: listening Unix domain sockets, else
+ *      .fd: open fd to a client, for reading client's request, or
+ *      .fd: open fd to a worker, to send request and get response back
+ * cinfo[i] - auxiliary client data for pfd[i]
+ *      .client_fd: open fd to a client, in case we already had read its
+ *          request and got a cache miss, and created a worker or
+ *          wait for another client's worker.
+ *          Otherwise, it's 0 and client's fd is in pfd[i].fd
+ *      .bufidx: index in client_buf[] we store client's request in
+ *      .bytecnt: size of the request
+ *      .started_ms: used to time out unresponsive clients
+ *      .respos:
+ *      .resptr:
+ *      .cache_pp: &cache[x][y] where the response is, or will be stored.
+ *      .ureq:
+ * When a client has received its reply (or otherwise closed (timeout etc)),
+ * corresponding pfd[i] and cinfo[i] are removed by shifting [i+1], [i+2] etc
+ * elements down, so that both arrays never have free holes.
+ * [num_clients] is always the first free element.
+ *
+ * Each of these also is a [max_reqnum] sized array, but indexes
+ * do not correspond directly to pfd[i] and cinfo[i]:
+ * client_buf[n][MAX_USER_REQ_SIZE] - buffers we read client requests into
+ * busy_cbuf[n] - bool flags marking busy client_buf[]
+ */
+/* Possible reductions:
+ * fd, bufidx - uint8_t
+ * started_ms -> uint16_t started_s
+ * ureq - eliminate (derivable from bufidx?)
+ */
+
+/* Are special bits 00? Then p is a true cached entry (note: NULL passes this test too) */
+#define CACHED_ENTRY(p)     ( ((long)(p) & 3) == 0 )
+/* Are special bits 11? is it a shared future cache entry? */
+#define CACHE_SHARED(p)     ( ((long)(p) & 3) == 3 )
+/* Return a ptr with special bits cleared (used for accessing data) */
+#define CACHE_PTR(p)        ( (void*) ((long)(p) & ~(long)3) )
+/* Return a ptr with special bit 0 set (x1): make future cache entry ptr */
+#define MAKE_FUTURE_PTR(p)  ( (void*) ((long)(p) | 1) )
+/* Takes the ADDRESS of a ptr and modifies it in place: special bits become 11 (shared future entry) */
+#define MARK_PTR_SHARED(pp) ( *(long*)(pp) |= 3 )
+
+static inline unsigned ureq_size(const user_req *ureq)
+{
+	return sizeof(user_req_header) + ureq->key_len; /* header plus key bytes */
+}
+
+static unsigned cache_age(const user_req *ureq)
+{
+	if (!CACHED_ENTRY(ureq)) return 0; /* tagged pointer: not a true cached entry */
+	/* Cast first: the 24-bit field promotes to signed int, where << 8 may overflow */
+	return (uint32_t) (g_now_ms - ((uint32_t)ureq->timestamp24 << 8));
+}
+
+static void set_cache_timestamp(user_req *ureq)
+{
+	ureq->timestamp24 = g_now_ms >> 8; /* stores g_now_ms / 256, truncated to the 24-bit field */
+}
+
+/* Claim the next free client_buf[] slot (round-robin); dies if none is free. */
+static int alloc_buf_no(void)
+{
+	const int start = next_buf;
+	do {
+		const int candidate = next_buf;
+		next_buf = (next_buf + 1) % max_reqnum;
+		if (busy_cbuf[candidate]) continue;
+		busy_cbuf[candidate] = 1;
+		return candidate;
+	} while (next_buf != start);
+	error_and_die("no free bufs?!");
+}
+
+static inline void *bufno2buf(int i)
+{
+	return client_buf[i]; /* start of the i-th MAX_USER_REQ_SIZE-byte request buffer */
+}
+
+static void close_client(unsigned i)
+{
+	log(L_DEBUG, "closing client %u (fd %u,%u)", i, pfd[i].fd, cinfo[i].client_fd);
+	/* Paranoia. We had nasty bugs where client was closed twice. */
+	if (pfd[i].fd == 0) /* fd 0 marks a slot which is already closed */
+		return;
+	close(pfd[i].fd);
+	if (cinfo[i].client_fd && cinfo[i].client_fd != pfd[i].fd) /* saved client fd, if it is a different one */
+		close(cinfo[i].client_fd);
+	pfd[i].fd = 0; /* flag as unused (coalescing needs this) */
+	busy_cbuf[cinfo[i].bufidx] = 0; /* release the request buffer */
+	cnt_closed++;
+	if (i < min_closed) /* remember the lowest closed slot index */
+		min_closed = i;
+}
+
+
+/*
+** nscd API <-> C API conversion
+*/
+
+typedef struct response_header {
+	uint32_t version_or_size; /* the marshalling code stores the total reply size here */
+	int32_t found; /* 1 if the lookup succeeded, 0 if not */
+	char body[0]; /* zero-length array: marks where the reply data starts */
+} response_header;
+
+typedef struct initgr_response_header {
+	uint32_t version_or_size;
+	int32_t found;
+	int32_t ngrps; /* number of gid[] entries that follow */
+	/* code assumes gid_t == int32, let's check that */
+	int32_t gid[sizeof(gid_t) == sizeof(int32_t) ? 0 : -1]; /* size 0 if the check passes, build error if not */
+	/* char user_str[as_needed]; */
+} initgr_response_header;
+
+/* Build the INITGROUPS reply for username: a header followed by ngrps gids.
+ * If the user is unknown the reply is just a zeroed header (found == 0). */
+static initgr_response_header *obtain_initgroups(const char *username)
+{
+	struct initgr_response_header *resp;
+	struct passwd *pw;
+	unsigned sz;
+	int ngroups;
+	pw = getpwnam(username);
+	if (!pw) {
+		/* Allocate the whole header, not just 8 bytes: version_or_size
+		 * announces sizeof(*resp) bytes and ngrps sits at offset 8 */
+		resp = xzalloc(sizeof(*resp));
+		resp->version_or_size = sizeof(*resp);
+		/*resp->found = 0;*/
+		/*resp->ngrps = 0;*/
+		goto ret;
+	}
+	/* getgrouplist may be very expensive, it's much better to allocate
+	 * a bit more than to run getgrouplist twice */
+	ngroups = 128;
+	resp = NULL;
+	do {
+		sz = sizeof(*resp) + sizeof(resp->gid[0]) * ngroups;
+		resp = xrealloc(resp, sz);
+	} while (getgrouplist(username, pw->pw_gid, (gid_t*) &resp->gid, &ngroups) == -1);
+	log(L_DEBUG, "ngroups=%d", ngroups);
+	sz = sizeof(*resp) + sizeof(resp->gid[0]) * ngroups;
+	/* resp = xrealloc(resp, sz); - why bother */
+	resp->version_or_size = sz;
+	resp->found = 1;
+	resp->ngrps = ngroups;
+ ret:
+	return resp;
+}
+
+typedef struct pw_response_header { /* filled in by marshal_passwd() */
+	uint32_t version_or_size; /* total size of the reply in bytes */
+	int32_t found;
+	int32_t pw_name_len; /* every xxx_len counts the terminating NUL too */
+	int32_t pw_passwd_len;
+	int32_t pw_uid;
+	int32_t pw_gid;
+	int32_t pw_gecos_len;
+	int32_t pw_dir_len;
+	int32_t pw_shell_len;
+	/* char pw_name[pw_name_len]; */
+	/* char pw_passwd[pw_passwd_len]; */
+	/* char pw_gecos[pw_gecos_len]; */
+	/* char pw_dir[pw_dir_len]; */
+	/* char pw_shell[pw_shell_len]; */
+} pw_response_header;
+
+/* Serialize pw into a malloc'ed nscd reply: header, then the five strings.
+ * pw == NULL gives a header-only "not found" reply (found == 0). */
+static pw_response_header *marshal_passwd(struct passwd *pw)
+{
+	char *p;
+	pw_response_header *resp;
+	unsigned pw_name_len, pw_passwd_len, pw_gecos_len;
+	unsigned pw_dir_len, pw_shell_len;
+	unsigned sz = sizeof(*resp);
+	if (pw) {
+		sz += (pw_name_len = strsize(pw->pw_name));
+		sz += (pw_passwd_len = strsize(pw->pw_passwd));
+		sz += (pw_gecos_len = strsize(pw->pw_gecos));
+		sz += (pw_dir_len = strsize(pw->pw_dir));
+		sz += (pw_shell_len = strsize(pw->pw_shell));
+	}
+	resp = xzalloc(sz);
+	resp->version_or_size = sz;
+	if (!pw) {
+		/*resp->found = 0;*/
+		goto ret;
+	}
+	resp->found = 1;
+	resp->pw_name_len = pw_name_len;
+	resp->pw_passwd_len = pw_passwd_len;
+	resp->pw_uid = pw->pw_uid;
+	resp->pw_gid = pw->pw_gid;
+	resp->pw_gecos_len = pw_gecos_len;
+	resp->pw_dir_len = pw_dir_len;
+	resp->pw_shell_len = pw_shell_len;
+	p = (char*)(resp + 1);
+	strcpy(p, pw->pw_name); p += pw_name_len;
+	strcpy(p, pw->pw_passwd); p += pw_passwd_len;
+	strcpy(p, pw->pw_gecos); p += pw_gecos_len;
+	strcpy(p, pw->pw_dir); p += pw_dir_len;
+	strcpy(p, pw->pw_shell); p += pw_shell_len;
+	/* the pointer difference is a ptrdiff_t, %u needs an unsigned */
+	log(L_DEBUG, "sz:%u realsz:%u", sz, (unsigned)(p - (char*)resp));
+ ret:
+	return resp;
+}
+
+typedef struct gr_response_header { /* filled in by marshal_group() */
+	uint32_t version_or_size; /* total size of the reply in bytes */
+	int32_t found;
+	int32_t gr_name_len;    /* strlen(gr->gr_name) + 1; */
+	int32_t gr_passwd_len;  /* strlen(gr->gr_passwd) + 1; */
+	int32_t gr_gid;         /* gr->gr_gid */
+	int32_t gr_mem_cnt;     /* while (gr->gr_mem[gr_mem_cnt]) ++gr_mem_cnt; */
+	/* int32_t gr_mem_len[gr_mem_cnt]; */
+	/* char gr_name[gr_name_len]; */
+	/* char gr_passwd[gr_passwd_len]; */
+	/* char gr_mem[gr_mem_cnt][gr_mem_len[i]]; */
+	/* char gr_gid_str[as_needed]; - huh? */
+	/* char orig_key[as_needed]; - needed?? I don't do this ATM... */
+/*
+ glibc adds gr_gid_str, but client doesn't get/use it:
+ writev(3, [{"\2\0\0\0\2\0\0\0\5\0\0\0", 12}, {"root\0", 5}], 2) = 17
+ poll([{fd=3, events=POLLIN|POLLERR|POLLHUP, revents=POLLIN}], 1, 5000) = 1
+ read(3, "\2\0\0\0\1\0\0\0\10\0\0\0\4\0\0\0\0\0\0\0\0\0\0\0", 24) = 24
+ readv(3, [{"", 0}, {"root\0\0\0\0\0\0\0\0", 12}], 2) = 12
+ read(3, NULL, 0)        = 0
+*/
+} gr_response_header;
+
+/* Serialize gr into a malloc'ed nscd reply: header, member length table,
+ * name, passwd, member names. gr == NULL gives a header-only "not found". */
+static gr_response_header *marshal_group(struct group *gr)
+{
+	char *p;
+	gr_response_header *resp;
+	unsigned gr_mem_cnt;
+	unsigned sz = sizeof(*resp);
+	if (gr) {
+		sz += strsize(gr->gr_name);
+		sz += strsize(gr->gr_passwd);
+		gr_mem_cnt = 0;
+		while (gr->gr_mem[gr_mem_cnt]) {
+			sz += strsize(gr->gr_mem[gr_mem_cnt]);
+			gr_mem_cnt++;
+		}
+		/* for int32_t gr_mem_len[gr_mem_cnt]; */
+		sz += gr_mem_cnt * sizeof(int32_t);
+	}
+	resp = xzalloc(sz);
+	resp->version_or_size = sz;
+	if (!gr) {
+		/*resp->found = 0;*/
+		goto ret;
+	}
+	resp->found = 1;
+	resp->gr_name_len = strsize(gr->gr_name);
+	resp->gr_passwd_len = strsize(gr->gr_passwd);
+	resp->gr_gid = gr->gr_gid;
+	resp->gr_mem_cnt = gr_mem_cnt;
+	p = (char*)(resp + 1);
+/* int32_t gr_mem_len[gr_mem_cnt]; */
+	gr_mem_cnt = 0;
+	while (gr->gr_mem[gr_mem_cnt]) {
+		*(uint32_t*)p = strsize(gr->gr_mem[gr_mem_cnt]);
+		p += 4;
+		gr_mem_cnt++;
+	}
+/* char gr_name[gr_name_len]; */
+	strcpy(p, gr->gr_name); p += strsize(gr->gr_name);
+/* char gr_passwd[gr_passwd_len]; */
+	strcpy(p, gr->gr_passwd); p += strsize(gr->gr_passwd);
+/* char gr_mem[gr_mem_cnt][gr_mem_len[i]]; */
+	gr_mem_cnt = 0;
+	while (gr->gr_mem[gr_mem_cnt]) {
+		strcpy(p, gr->gr_mem[gr_mem_cnt]);
+		p += strsize(gr->gr_mem[gr_mem_cnt]);
+		gr_mem_cnt++;
+	}
+	log(L_DEBUG, "sz:%u realsz:%u", sz, (unsigned)(p - (char*)resp)); /* cast: %u expects unsigned */
+ ret:
+	return resp;
+}
+
+typedef struct hst_response_header { /* filled in by marshal_hostent() */
+	uint32_t version_or_size; /* total size of the reply in bytes */
+	int32_t found;
+	int32_t h_name_len; /* includes the NUL and the padding to a multiple of 4 */
+	int32_t h_aliases_cnt;
+	int32_t h_addrtype;     /* AF_INET or AF_INET6 */
+	int32_t h_length;       /* 4 or 16 */
+	int32_t h_addr_list_cnt;
+	int32_t error; /* HOST_NOT_FOUND in a "not found" reply, otherwise left 0 */
+	/* char h_name[h_name_len]; - we pad it to 4 bytes */
+	/* uint32_t h_aliases_len[h_aliases_cnt]; */
+	/* char h_addr_list[h_addr_list_cnt][h_length]; - every one is the same size [h_length] (4 or 16) */
+	/* char h_aliases[h_aliases_cnt][h_aliases_len[i]]; */
+} hst_response_header;
+
+/* Serialize h into a malloc'ed nscd reply: header, padded name, alias length
+ * table, addresses, aliases. h == NULL gives a HOST_NOT_FOUND reply. */
+static hst_response_header *marshal_hostent(struct hostent *h)
+{
+	char *p;
+	hst_response_header *resp;
+	unsigned h_name_len, h_aliases_cnt, h_addr_list_cnt;
+	unsigned sz = sizeof(*resp);
+	if (h) {
+/* char h_name[h_name_len] */
+		sz += h_name_len = strsize_aligned4(h->h_name);
+		h_addr_list_cnt = 0;
+		while (h->h_addr_list[h_addr_list_cnt]) {
+			h_addr_list_cnt++;
+		}
+/* char h_addr_list[h_addr_list_cnt][h_length] */
+		sz += h_addr_list_cnt * h->h_length;
+		h_aliases_cnt = 0;
+		while (h->h_aliases[h_aliases_cnt]) {
+/* char h_aliases[h_aliases_cnt][h_aliases_len[i]] */
+			sz += strsize(h->h_aliases[h_aliases_cnt]);
+			h_aliases_cnt++;
+		}
+/* uint32_t h_aliases_len[h_aliases_cnt] */
+		sz += h_aliases_cnt * 4;
+	}
+	resp = xzalloc(sz);
+	resp->version_or_size = sz;
+	if (!h) {
+		/*resp->found = 0;*/
+		resp->error = HOST_NOT_FOUND;
+		goto ret;
+	}
+	resp->found = 1;
+	resp->h_name_len = h_name_len;
+	resp->h_aliases_cnt = h_aliases_cnt;
+	resp->h_addrtype = h->h_addrtype;
+	resp->h_length = h->h_length;
+	resp->h_addr_list_cnt = h_addr_list_cnt;
+	/*resp->error = 0;*/
+	p = (char*)(resp + 1);
+/* char h_name[h_name_len]; */
+	strcpy(p, h->h_name);
+	p += h_name_len;
+/* uint32_t h_aliases_len[h_aliases_cnt]; */
+	h_aliases_cnt = 0;
+	while (h->h_aliases[h_aliases_cnt]) {
+		*(uint32_t*)p = strsize(h->h_aliases[h_aliases_cnt]);
+		p += 4;
+		h_aliases_cnt++;
+	}
+/* char h_addr_list[h_addr_list_cnt][h_length]; */
+	h_addr_list_cnt = 0;
+	while (h->h_addr_list[h_addr_list_cnt]) {
+		memcpy(p, h->h_addr_list[h_addr_list_cnt], h->h_length);
+		p += h->h_length;
+		h_addr_list_cnt++;
+	}
+/* char h_aliases[h_aliases_cnt][h_aliases_len[i]]; */
+	h_aliases_cnt = 0;
+	while (h->h_aliases[h_aliases_cnt]) {
+		strcpy(p, h->h_aliases[h_aliases_cnt]);
+		p += strsize(h->h_aliases[h_aliases_cnt]);
+		h_aliases_cnt++;
+	}
+	log(L_DEBUG, "sz:%u realsz:%u", sz, (unsigned)(p - (char*)resp)); /* cast: %u expects unsigned */
+ ret:
+	return resp;
+}
+
+/* Reply to addrinfo query */
+typedef struct ai_response_header { /* filled in by obtain_addrinfo() */
+	uint32_t version_or_size; /* total size of the reply in bytes */
+	int32_t found;
+	int32_t naddrs;
+	int32_t addrslen; /* 4 bytes per AF_INET address, 16 per any other */
+	int32_t canonlen; /* strlen(canonname) + 1, or 0 if there is none */
+	int32_t error; /* getaddrinfo() return value */
+	/* char ai_addr[naddrs][4 or 16]; - addrslen bytes in total */
+	/* char ai_family[naddrs]; - AF_INET[6] each (determines ai_addr[i] length) */
+	/* char ai_canonname[canonlen]; */
+} ai_response_header;
+
+/* Resolve hostname with getaddrinfo() and serialize the result into a
+ * malloc'ed reply: addresses, one family byte per address, canonname. */
+static ai_response_header *obtain_addrinfo(const char *hostname)
+{
+	struct addrinfo hints;
+	struct addrinfo *ai, *ap;
+	ai_response_header *resp;
+	char *p, *family;
+	int err;
+	unsigned sz;
+	unsigned naddrs = 0, addrslen = 0;
+	unsigned canonlen = 0;
+
+	memset(&hints, 0, sizeof(hints));
+	hints.ai_flags = AI_CANONNAME;
+	/* kills dups (one for each possible SOCK_xxx) */
+	/* this matches glibc behavior */
+	hints.ai_socktype = SOCK_STREAM;
+	ai = NULL; /* on failure getaddrinfo may leave it as-is */
+	err = getaddrinfo(hostname, NULL, &hints, &ai);
+	sz = sizeof(*resp);
+	if (!err) {
+		if (ai->ai_canonname)
+			sz += canonlen = strsize(ai->ai_canonname);
+		ap = ai;
+		do {
+			naddrs++;
+			addrslen += (ap->ai_family == AF_INET ? 4 : 16);
+			ap = ap->ai_next;
+		} while (ap);
+		sz += naddrs + addrslen;
+	}
+	resp = xzalloc(sz);
+	resp->version_or_size = sz;
+	resp->error = err;
+	if (err) {
+		/*resp->found = 0;*/
+		goto ret;
+	}
+	resp->found = 1;
+	resp->naddrs = naddrs;
+	resp->addrslen = addrslen;
+	resp->canonlen = canonlen;
+	p = (char*)(resp + 1);
+	family = p + addrslen;
+	ap = ai;
+	do {
+/* char ai_family[naddrs]; */
+		*family++ = ap->ai_family;
+/* char ai_addr[naddrs][4 or 16]; */
+		if (ap->ai_family == AF_INET) {
+			memcpy(p, &(((struct sockaddr_in*)(ap->ai_addr))->sin_addr), 4);
+			p += 4;
+		} else {
+			memcpy(p, &(((struct sockaddr_in6*)(ap->ai_addr))->sin6_addr), 16);
+			p += 16;
+		}
+		ap = ap->ai_next;
+	} while (ap);
+/* char ai_canonname[canonlen]; */
+	if (ai->ai_canonname)
+		strcpy(family, ai->ai_canonname);
+	/* use canonlen (0 if no canonname): strsize(NULL) would crash here */
+	log(L_DEBUG, "sz:%u realsz:%u", sz, (unsigned)(family + canonlen - (char*)resp));
+ ret:
+	/* glibc 2.3.6 segfaults here sometimes
+	 * (maybe my mistake, fixed by "ai = NULL;" above).
+	 * Since we are in worker and are going to exit anyway, why bother? */
+	/*freeaddrinfo(ai);*/
+	return resp;
+}
+
+
+/*
+** Cache management
+*/
+
+/* one 8-element "cacheline" */
+typedef user_req *cacheline_t[8];
+static unsigned cache_size; /* number of cachelines; lookup_in_cache() reduces the hash modulo this */
+/* Points to cacheline_t  cache[cache_size] array, or in other words,
+ * points to user_req*    cache[cache_size][8] array */
+static cacheline_t *cache;
+static unsigned cached_cnt; /* ++ when lookup_in_cache() claims an empty slot, -- when age_cache() frees an entry */
+static unsigned cache_access_cnt = 1; /* prevent division by zero */
+static unsigned cache_hit_cnt = 1; /* ++ in lookup_in_cache() whenever a matching slot is found */
+static unsigned last_age_time; /* g_now_ms at the last periodic age_cache() call made by main_loop() */
+static unsigned aging_interval_ms; /* recomputed by age_cache(); reset to the minimum by handle_worker_response() */
+static unsigned min_aging_interval_ms; /* main_loop() sets it to (smallest negative TTL / 2) | 1 */
+
+static response_header *ureq_response(user_req *ureq)
+{
+	/* The answer follows the query at the next 32-bit boundary: round the query size up to it */
+	unsigned answer_ofs = (ureq_size(ureq) + 3) & ~3;
+	return (void*) ((char*)ureq + answer_ofs);
+}
+
+/* Bernstein-style hash (hash * 33 ^ byte) of the sz bytes at p, seeded with `hash`; good for short text. sz == 0 returns the seed. */
+static uint32_t bernstein_hash(void *p, unsigned sz, uint32_t hash)
+{
+	const uint8_t *key = p;
+	while (sz--) { /* pre-test loop: a zero length must not wrap around and read ~4G bytes */
+		hash = (32 * hash + hash) ^ *key++;
+	}
+	return hash;
+}
+
+/* Release the entry *ureqp points to and clear that slot: an entry still in use
+ * (refcount != 0) only loses one count, an unreferenced one is freed. */
+static void free_refcounted_ureq(user_req **ureqp)
+{
+	user_req *ureq = *ureqp;
+	if (ureq == NULL || !CACHED_ENTRY(ureq)) /* empty slot, or a "future" (still in progress) pointer */
+		return;
+	if (ureq->refcount) {
+		ureq->refcount--;
+	} else {
+		log(L_DEBUG2, "refcount == 0, free(%p)", ureq);
+		free(ureq);
+	}
+	*ureqp = NULL;
+}
+
+static user_req **lookup_in_cache(user_req *ureq)
+{
+	user_req **cacheline;
+	int free_cache;
+	unsigned hash;
+	unsigned i;
+	unsigned ureq_sz = ureq_size(ureq);
+	/* Returns the slot matching ureq, or a slot newly claimed for it (then holding a "future" ptr), or NULL */
+	/* prevent overflow and division by zero */
+	cache_access_cnt++;
+	if ((int)cache_access_cnt < 0) {
+		cache_access_cnt = (cache_access_cnt >> 1) + 1;
+		cache_hit_cnt = (cache_hit_cnt >> 1) + 1;
+	}
+
+	hash = bernstein_hash(&ureq->key_len, ureq_sz - offsetof(user_req, key_len), ureq->type);
+	log(L_DEBUG2, "hash:%08x", hash);
+	hash = hash % cache_size;
+	cacheline = cache[hash];
+
+	free_cache = -1;
+	for (i = 0; i < 8; i++) {
+		user_req *cached = CACHE_PTR(cacheline[i]);
+		if (!cached) {
+			if (free_cache == -1)
+				free_cache = i;
+			continue;
+		}
+		/* ureq->version is always 2 and is reused in cache
+		 * for other purposes, we need to skip it here */
+		if (memcmp(&ureq->type, &cached->type, ureq_sz - offsetof(user_req, type)) == 0) {
+			log(L_DEBUG, "found in cache[%u][%u]", hash, i);
+			cache_hit_cnt++;
+			return &cacheline[i];
+		}
+	}
+	/* No match. Prefer the first empty slot seen during the scan */
+	if (free_cache >= 0) {
+		cached_cnt++;
+		i = free_cache;
+		log(L_DEBUG, "not found, using free cache[%u][%u]", hash, i);
+		goto ret;
+	}
+	/* Cacheline is full: the entry with the largest cache_age() becomes the eviction victim */
+	unsigned oldest_idx = 0;
+	unsigned oldest_age = 0;
+	for (i = 0; i < 8; i++) {
+		unsigned age = cache_age(cacheline[i]);
+		if (age > oldest_age) {
+			oldest_age = age;
+			oldest_idx = i;
+		}
+	}
+	if (oldest_age == 0) {
+		/* All entries in cacheline are "future" entries!
+		 * This is very unlikely, but we must still work correctly.
+		 * We call this "fake cache entry".
+		 * The data will be "cached" only for the duration
+		 * of this client's request lifetime.
+		 */
+		log(L_DEBUG, "not found, and cache[%u] is full: using fake cache entry", hash);
+		return NULL;
+	}
+	i = oldest_idx;
+	log(L_DEBUG, "not found, freeing and reusing cache[%u][%u] (age %u)", hash, i, oldest_age);
+	free_refcounted_ureq(&cacheline[i]);
+
+ ret:
+	cacheline[i] = MAKE_FUTURE_PTR(ureq);
+	return &cacheline[i];
+}
+
+/* Walk every cache slot and release finished entries of service srv (-1: all services):
+ * all of them if free_all is set, otherwise only those at least as old as their TTL */
+static void age_cache(unsigned free_all, int srv)
+{
+	user_req **slot = *cache;
+	int slots_left = cache_size * 8;
+	unsigned cnt_before = cached_cnt;
+
+	log(L_DEBUG, "aging cache, srv:%d, free_all:%u", srv, free_all);
+	if (srv == -1 || free_all)
+		aging_interval_ms = INT_MAX;
+	do {
+		user_req *cached = *slot;
+		if (CACHED_ENTRY(cached) && cached != NULL
+		 && (srv == -1 || srv == type_to_srv[cached->type])
+		) {
+			int csrv = type_to_srv[cached->type];
+			unsigned drop = free_all;
+			if (!drop) {
+				unsigned age = cache_age(cached);
+				response_header *resp = ureq_response(cached);
+				unsigned ttl = resp->found ? config.pttl[csrv] : config.nttl[csrv];
+				if (age >= ttl) {
+					log(L_DEBUG2, "freeing: age %u positive %d ttl %u", age, resp->found, ttl);
+					drop = 1;
+				} else if (srv == -1 && aging_interval_ms > ttl - age) {
+					aging_interval_ms = ttl - age; /* age again no later than this entry's expiry */
+				}
+			}
+			if (drop) {
+				cached_cnt--;
+				free_refcounted_ureq(slot);
+			}
+		}
+		slot++;
+	} while (--slots_left);
+	log(L_INFO, "aged cache, freed:%u, remain:%u", cnt_before - cached_cnt, cached_cnt);
+	log(L_DEBUG2, "aging interval now %u ms", aging_interval_ms);
+}
+
+
+/*
+** Worker child
+*/
+
+/* Spawns a worker (vfork + re-exec of ourself) and feeds it with user query on its stdin */
+/* Returns the read end of the worker's stdout pipe, in blocking mode, marked close-on-exec */
+static int create_and_feed_worker(user_req *ureq)
+{
+	pid_t pid;
+	struct {
+		int rd; /* xpipe() is handed &x.rd: presumably it fills rd and wr like pipe(2) - TODO confirm */
+		int wr;
+	} to_child, to_parent;
+
+	/* NB: these pipe fds are in blocking mode and non-CLOEXECed */
+	xpipe(&to_child.rd);
+	xpipe(&to_parent.rd);
+
+	pid = vfork();
+	if (pid < 0) /* error */
+		perror_and_die("vfork");
+	if (!pid) { /* child */
+		char param[sizeof(int)*3 + 2];
+		char *argv[3];
+
+		close(to_child.wr);
+		close(to_parent.rd);
+		xmovefd(to_child.rd, 0);
+		xmovefd(to_parent.wr, 1);
+		sprintf(param, "%u", debug);
+		argv[0] = (char*) "worker_nscd";
+		argv[1] = param;
+		argv[2] = NULL;
+		/* Re-exec ourself, cleaning up all allocated memory.
+		 * fds in parent are marked CLOEXEC and will be closed too
+		 * (modulo bugs) */
+		/* Try link name first: it's better to have comm field
+		 * of "nscd" than "exe" (pgrep reported to fail to find us
+		 * by name when comm field contains "exe") */
+		execve(self_exe_points_to, argv, argv+2); /* envp = argv+2 = { NULL }: empty environment */
+		xexecve("/proc/self/exe", argv, argv+2); /* reached only if the execve() above failed */
+	}
+
+	/* parent */
+	close(to_child.rd);
+	close(to_parent.wr);
+	/* We do not expect child to block for any noticeably long time,
+	 * and also we expect write to be one-piece one:
+	 * ureq size is <= 1k and pipes are guaranteed to accept
+	 * at least PIPE_BUF at once */
+	xsafe_write(to_child.wr, ureq, ureq_size(ureq));
+
+	close(to_child.wr);
+	close_on_exec(to_parent.rd);
+	return to_parent.rd;
+}
+
+static user_req *worker_ureq; /* request being served; set by worker(), read by worker_signal_handler() */
+
+/* Printable form of a request key for log messages (debug builds only; non-debug builds get a bare declaration) */
+#if DEBUG_BUILD
+static const char *req_str(unsigned type, const char *buf)
+{
+	if (type == GETHOSTBYADDR) {
+		struct in_addr in;
+		memcpy(&in.s_addr, buf, sizeof(in.s_addr)); /* no uint32_t access through a char pointer */
+		return inet_ntoa(in);
+	}
+	if (type == GETHOSTBYADDRv6)
+		return "IPv6";
+	return buf;
+}
+#else
+const char *req_str(unsigned type, const char *buf);
+#endif
+
+/* Fatal-signal handler of the worker process: log which request (worker_ureq)
+ * was being handled when the signal arrived, then leave via _exit(0) */
+static void worker_signal_handler(int sig)
+{
+	const user_req *req = worker_ureq;
+
+#if DEBUG_BUILD
+	log(L_INFO, "worker:%d got sig:%d while handling req "
+		"type:%d(%s) key_len:%d '%s'",
+		getpid(), sig, req->type, typestr[req->type],
+		req->key_len, req_str(req->type, req->reqbuf));
+#else
+	log(L_INFO, "worker:%d got sig:%d while handling req "
+		"type:%d key_len:%d",
+		getpid(), sig, req->type, req->key_len);
+#endif
+	_exit(0);
+}
+
+/* Worker process body: read one request from fd 0, do the lookup, write the marshalled reply to fd 1, _exit(0) */
+static void worker(const char *param) NORETURN;
+static void worker(const char *param)
+{
+	user_req ureq;
+	void *resp;
+
+	debug = atoi(param);
+
+	worker_ureq = &ureq; /* for signal handler */
+
+	/* Make sure we won't hang, but rather die */
+	if (WORKER_TIMEOUT_SEC)
+		alarm(WORKER_TIMEOUT_SEC);
+
+	/* NB: fds 0, 1 are in blocking mode */
+
+	/* We block here (for a short time) */
+	/* Due to ureq size < PIPE_BUF read is atomic */
+	/* No error or size checking: we trust the parent */
+	safe_read(0, &ureq, sizeof(ureq));
+
+	signal(SIGSEGV,   worker_signal_handler);
+	signal(SIGBUS,    worker_signal_handler);
+	signal(SIGILL,    worker_signal_handler);
+	signal(SIGFPE,    worker_signal_handler);
+	signal(SIGABRT,   worker_signal_handler);
+#ifdef SIGSTKFLT
+	signal(SIGSTKFLT, worker_signal_handler);
+#endif
+
+	if (ureq.type == GETHOSTBYNAME
+	 || ureq.type == GETHOSTBYNAMEv6
+	) {
+		resp = marshal_hostent(
+			ureq.type == GETHOSTBYNAME
+			? gethostbyname(ureq.reqbuf)
+			: gethostbyname2(ureq.reqbuf, AF_INET6)
+		);
+	} else if (ureq.type == GETHOSTBYADDR
+	 || ureq.type == GETHOSTBYADDRv6
+	) {
+		resp = marshal_hostent(gethostbyaddr(ureq.reqbuf, ureq.key_len,
+			(ureq.type == GETHOSTBYADDR ? AF_INET : AF_INET6)
+		));
+	} else if (ureq.type == GETPWBYNAME) {
+		struct passwd *pw;
+		log(L_DEBUG2, "getpwnam('%s')", ureq.reqbuf);
+		pw = getpwnam(ureq.reqbuf);
+		log(L_DEBUG2, "getpwnam result:%p", pw);
+		resp = marshal_passwd(pw);
+	} else if (ureq.type == GETPWBYUID) {
+		/* uid/gid keys are decimal text; atoi() is undefined above INT_MAX, strtoul() covers the unsigned range */
+		resp = marshal_passwd(getpwuid(strtoul(ureq.reqbuf, NULL, 10)));
+	} else if (ureq.type == GETGRBYNAME) {
+		resp = marshal_group(getgrnam(ureq.reqbuf));
+	} else if (ureq.type == GETGRBYGID) {
+		resp = marshal_group(getgrgid(strtoul(ureq.reqbuf, NULL, 10)));
+	} else if (ureq.type == GETAI) {
+		resp = obtain_addrinfo(ureq.reqbuf);
+	} else /*if (ureq.type == INITGROUPS)*/ {
+		resp = obtain_initgroups(ureq.reqbuf);
+	}
+
+	if (!((response_header*)resp)->found) {
+		/* Parent knows about this special case */
+		xfull_write(1, resp, 8);
+	} else {
+		/* Responses can be big (getgrnam("guest") on a big user db),
+		 * we cannot rely on them being atomic. full_write loops
+		 * if needed */
+		xfull_write(1, resp, ((response_header*)resp)->version_or_size);
+	}
+	_exit(0);
+}
+
+
+/*
+** Main loop
+*/
+
+static const char checked_filenames[][sizeof("/etc/passwd")] = { /* per-service file watched by check_files() */
+	[SRV_PASSWD] = "/etc/passwd", /*  "/etc/shadow"? */
+	[SRV_GROUP]  = "/etc/group",
+	[SRV_HOSTS]  = "/etc/hosts", /* "/etc/resolv.conf" "/etc/nsswitch.conf"? */
+};
+/* Per-service value (mtime ^ size ^ inode) remembered by check_files(); initially 0 */
+static long checked_status[ARRAY_SIZE(checked_filenames)];
+
+/* If the file watched for service srv looks different from what the previous
+ * check saw (mtime, size or inode), flush the cached entries of that service */
+static void check_files(int srv)
+{
+	const char *file = checked_filenames[srv];
+	struct stat sb;
+	long fingerprint;
+
+	memset(&sb, 0, sizeof(sb));
+	stat(file, &sb); /* ignore errors */
+	/* Comparing whole struct stat's gave false positives, so mix only these fields: */
+	fingerprint = (long)sb.st_mtime ^ (long)sb.st_size ^ (long)sb.st_ino;
+	if (fingerprint == checked_status[srv])
+		return;
+	checked_status[srv] = fingerprint;
+	log(L_INFO, "detected change in %s", file);
+	age_cache(/*free_all:*/ 1, srv);
+}
+
+/* Validate client i's request and act on it. Returns 1 if we immediately have the answer (cache hit); 0 otherwise */
+static int handle_client(int i)
+{
+	int srv;
+	user_req *ureq = cinfo[i].ureq;
+	user_req **cache_pp;
+	user_req *ureq_and_resp;
+
+#if DEBUG_BUILD
+	log(L_DEBUG, "version:%d type:%d(%s) key_len:%d '%s'",
+			ureq->version, ureq->type,
+			ureq->type < ARRAY_SIZE(typestr) ? typestr[ureq->type] : "?",
+			ureq->key_len, req_str(ureq->type, ureq->reqbuf));
+#endif
+
+	if (ureq->version != NSCD_VERSION) {
+		log(L_INFO, "wrong version");
+		close_client(i);
+		return 0;
+	}
+	if (ureq->key_len > sizeof(ureq->reqbuf)) {
+		log(L_INFO, "bogus key_len %u - ignoring", ureq->key_len);
+		close_client(i);
+		return 0;
+	}
+	if (cinfo[i].bytecnt < USER_HDR_SIZE + ureq->key_len) {
+		log(L_INFO, "read %d, need to read %d",
+			cinfo[i].bytecnt, USER_HDR_SIZE + ureq->key_len);
+		return 0; /* more to read */
+	}
+	if (cinfo[i].bytecnt > USER_HDR_SIZE + ureq->key_len) {
+		log(L_INFO, "read overflow: %u > %u",
+			(int)cinfo[i].bytecnt, (int)(USER_HDR_SIZE + ureq->key_len));
+		close_client(i);
+		return 0;
+	}
+	if (unsupported_ureq_type(ureq->type)) {
+		/* We don't know this request. Just close the connection.
+		 * (glibc client interprets this like "not supported by this nscd")
+		 * Happens very often, thus DEBUG, not INFO */
+		log(L_DEBUG, "unsupported query, dropping");
+		close_client(i);
+		return 0;
+	}
+	srv = type_to_srv[ureq->type];
+	if (!config.srv_enable[srv]) {
+		log(L_INFO, "service %d is disabled, dropping", srv);
+		close_client(i);
+		return 0;
+	}
+
+	hex_dump(cinfo[i].ureq, cinfo[i].bytecnt);
+
+	if (ureq->type == SHUTDOWN
+	 || ureq->type == INVALIDATE
+	) {
+#ifdef SO_PEERCRED
+		struct ucred caller;
+		socklen_t optlen = sizeof(caller);
+		if (getsockopt(pfd[i].fd, SOL_SOCKET, SO_PEERCRED, &caller, &optlen) < 0) {
+			log(L_INFO, "ignoring special request - cannot get caller's id: %s", strerror(errno));
+			close_client(i);
+			return 0;
+		}
+		if (caller.uid != 0) {
+			log(L_INFO, "special request from non-root - ignoring");
+			close_client(i);
+			return 0;
+		}
+#endif
+		if (ureq->type == SHUTDOWN) {
+			log(L_INFO, "got shutdown request, exiting");
+			exit(0);
+		}
+		if (!ureq->key_len || ureq->reqbuf[ureq->key_len - 1]) {
+			log(L_INFO, "malformed invalidate request - ignoring");
+			close_client(i);
+			return 0;
+		}
+		log(L_INFO, "got invalidate request, flushing cache");
+		/* Frees entire cache. TODO: replace -1 with service (in ureq->reqbuf) */
+		age_cache(/*free_all:*/ 1, -1);
+		close_client(i);
+		return 0;
+	}
+	/* Every key except a raw address (GETHOSTBYADDR[v6]) must be a NUL-terminated string */
+	if (ureq->type != GETHOSTBYADDR
+	 && ureq->type != GETHOSTBYADDRv6
+	) {
+		if (ureq->key_len && ureq->reqbuf[ureq->key_len - 1] != '\0') {
+			log(L_INFO, "badly terminated buffer");
+			close_client(i);
+			return 0;
+		}
+	}
+
+	if (config.check_files[srv]) {
+		check_files(srv);
+	}
+
+	cache_pp = lookup_in_cache(ureq);
+	ureq_and_resp = cache_pp ? *cache_pp : NULL;
+	/* ureq_and_resp is now NULL (fake entry), a finished cached answer, or an in-progress "future" pointer */
+	if (ureq_and_resp) {
+		if (CACHED_ENTRY(ureq_and_resp)) {
+			/* Found. Save ptr to response into cinfo and return */
+			response_header *resp = ureq_response(ureq_and_resp);
+			unsigned sz = resp->version_or_size;
+
+			log(L_DEBUG, "sz:%u", sz);
+			hex_dump(resp, sz);
+			ureq_and_resp->refcount++; /* cache shouldn't free it under us! */
+			pfd[i].events = POLLOUT; /* we want to write out */
+			cinfo[i].resptr = ureq_and_resp;
+			/*cinfo[i].respos = 0; - already is */
+			/* prevent future matches with anything */
+			cinfo[i].cache_pp = (void *) 1;
+			return 1; /* "ready to write data out to client" */
+		}
+
+		/* Not found. Remember a pointer where it will appear */
+		cinfo[i].cache_pp = cache_pp;
+
+		/* If it does not point to our own ureq buffer... */
+		if (CACHE_PTR(ureq_and_resp) != ureq) {
+			/* We are not the first client who wants this */
+			log(L_DEBUG, "another request is in progress (%p), waiting for its result", ureq_and_resp);
+			MARK_PTR_SHARED(cache_pp); /* "please inform us when it's ready" */
+			/* "we do not wait for client anymore" */
+			cinfo[i].client_fd = pfd[i].fd;
+			/* Don't wait on fd. Worker response will unblock us */
+			pfd[i].events = 0;
+			return 0;
+		}
+		/* else: lookup_in_cache inserted (ureq & 1) into *cache_pp:
+		 * we are the first client to miss on this ureq. */
+	}
+
+	/* Start worker thread */
+	log(L_DEBUG, "stored %p in cache, starting a worker", ureq_and_resp);
+	/* Now we will wait on worker's fd, not client's! */
+	cinfo[i].client_fd = pfd[i].fd;
+	pfd[i].fd = create_and_feed_worker(ureq);
+	return 0;
+}
+
+/* Switch client i to "answer is ready": poll its own socket for writability
+ * and write out `cached` starting at the beginning */
+static void prepare_for_writeout(unsigned i, user_req *cached)
+{
+	log(L_DEBUG2, "client %u: data is ready at %p", i, cached);
+
+	if (cinfo[i].client_fd != 0) {
+		/* pfd[i].fd was not used for the client while it waited: point it at the client socket again */
+		pfd[i].fd = cinfo[i].client_fd;
+		cinfo[i].client_fd = 0; /* "we don't wait for worker reply" */
+	}
+	cinfo[i].resptr = cached;
+	/* cinfo[i].respos is already 0, so the writeout starts at the first byte */
+	pfd[i].events = POLLOUT;
+	/* The worker may have taken a while (e.g. DNS query): do not let the client timeout trigger at once */
+	cinfo[i].started_ms = g_now_ms;
+}
+
+/* Worker seems to be ready to write the response.
+ * When we return, worker's fd is closed, pfd[i] and cinfo[i] are updated, and the
+ * response is fully read and stored in cache - or, if the worker failed, nothing is. */
+static void handle_worker_response(int i)
+{
+	struct { /* struct response_header + small body */
+		uint32_t version_or_size;
+		int32_t found;
+		char body[256 - 8];
+	} sz_and_found;
+	user_req *cached = NULL;
+	user_req *ureq;
+	response_header *resp;
+	unsigned sz, resp_sz;
+	unsigned ureq_sz_aligned;
+
+	ureq = cinfo[i].ureq;
+	ureq_sz_aligned = (char*)ureq_response(ureq) - (char*)ureq;
+
+	sz = full_read(pfd[i].fd, &sz_and_found, sizeof(sz_and_found));
+	if (sz < 8) {
+		/* worker was killed? */
+		log(L_DEBUG, "worker gave short reply:%u < 8", sz);
+		goto err;
+	}
+
+	resp_sz = sz_and_found.version_or_size;
+	if (resp_sz < sz || resp_sz > 0x0fffffff) { /* 256 mb */
+		error("BUG: bad size from worker:%u", resp_sz);
+		goto err;
+	}
+
+	/* Create new block of cached info */
+	cached = xzalloc(ureq_sz_aligned + resp_sz);
+	log(L_DEBUG2, "xzalloc(%u):%p sz:%u resp_sz:%u found:%u",
+			ureq_sz_aligned + resp_sz, cached,
+			sz, resp_sz,
+			(int)sz_and_found.found
+			);
+	resp = (void*) (((char*) cached) + ureq_sz_aligned);
+	memcpy(cached, ureq, ureq_size(ureq));
+	memcpy(resp, &sz_and_found, sz);
+	if (sz_and_found.found && resp_sz > sz) {
+		/* We need to read data only if it's found
+		 * (otherwise worker sends only 8 bytes).
+		 *
+		 * Replies can be big (getgrnam("guest") on a big user db),
+		 * we cannot rely on them being atomic. However, we know
+		 * that worker _always_ gives reply in one full_write(),
+		 * so we loop and read it all
+		 * (looping is implemented inside full_read())
+		 */
+		if (full_read(pfd[i].fd, ((char*) resp) + sz, resp_sz - sz) != resp_sz - sz) {
+			/* worker was killed? */
+			log(L_DEBUG, "worker gave short reply, free(%p)", cached);
+ err:
+			free(cached);
+			cached = NULL;
+			goto wo;
+		}
+	}
+	set_cache_timestamp(cached);
+	hex_dump(resp, resp_sz);
+
+ wo:
+	close(pfd[i].fd);
+
+	/* Save in cache */
+	unsigned ref = 0;
+	user_req **cache_pp = cinfo[i].cache_pp;
+	if (cache_pp != NULL) { /* if not a fake entry */
+		ureq = *cache_pp;
+		*cache_pp = cached;
+		if (!cached) cached_cnt--; /* worker failed: the slot lookup_in_cache() counted is empty again */
+		if (CACHE_SHARED(ureq)) {
+			/* Other clients wait for this response too,
+			 * wake them (and us) up and set refcount = no_of_clients */
+			unsigned j;
+
+			for (j = 2; j < num_clients; j++) {
+				if (cinfo[j].cache_pp == cache_pp) {
+					/* This client uses the same cache entry */
+					ref++;
+					/* prevent future matches with anything */
+					cinfo[j].cache_pp = (void *) 1;
+					prepare_for_writeout(j, cached);
+				}
+			}
+			goto ret;
+		}
+		/* prevent future matches with anything */
+		cinfo[i].cache_pp = (void *) 1;
+		ref = 1;
+	}
+
+	prepare_for_writeout(i, cached);
+ret:
+	/* cache shouldn't free it under us! */
+	if (cached)
+		cached->refcount = ref;
+	aging_interval_ms = min_aging_interval_ms;
+}
+
+/* The daemon's event loop: one poll() per pass over listening sockets, clients and workers. Never returns. */
+static void main_loop(void)
+{
+	/* 1/2 of smallest negative TTL */
+	min_aging_interval_ms = config.nttl[0];
+	if (min_aging_interval_ms > config.nttl[1]) min_aging_interval_ms = config.nttl[1];
+	if (min_aging_interval_ms > config.nttl[2]) min_aging_interval_ms = config.nttl[2];
+	min_aging_interval_ms = (min_aging_interval_ms / 2) | 1;
+	aging_interval_ms = min_aging_interval_ms;
+
+	while (1) {
+		int i, j, r;
+
+		r = SMALL_POLL_TIMEOUT_MS;
+		if (num_clients <= 2 && !cached_cnt)
+			r = -1; /* infinite */
+		else if (num_clients < max_reqnum)
+			r = aging_interval_ms;
+#if 0 /* Debug: leak detector */
+		{
+			static unsigned long long cnt;
+			static unsigned long low_malloc = -1L;
+			static unsigned long low_sbrk = -1L;
+			void *p = malloc(540); /* should not be too small */
+			void *s = sbrk(0);
+			free(p);
+			if ((unsigned long)p < low_malloc)
+				low_malloc = (unsigned long)p;
+			if ((unsigned long)s < low_sbrk)
+				low_sbrk = (unsigned long)s;
+			log(L_INFO, "poll %llu (%d ms). clients:%u cached:%u %u/%u malloc:%p (%lu), sbrk:%p (%lu)",
+				cnt, r, num_clients, cached_cnt, cache_hit_cnt, cache_access_cnt,
+				p, (unsigned long)p - low_malloc,
+				s, (unsigned long)s - low_sbrk);
+			cnt++;
+		}
+#else
+		log(L_DEBUG, "poll %d ms. clients:%u cached:%u hit ratio:%u/%u",
+				r, num_clients, cached_cnt, cache_hit_cnt, cache_access_cnt);
+#endif
+
+		r = poll(pfd, num_clients, r);
+		log(L_DEBUG2, "poll returns %d", r);
+		if (r < 0) {
+			if (errno != EINTR)
+				perror_and_die("poll");
+			continue;
+		}
+
+		/* Everything between polls never sleeps.
+		 * There is no blocking I/O (except when we talk to worker thread
+		 * which is guaranteed to not block us for long) */
+
+		g_now_ms = monotonic_ms();
+		if (r == 0)
+			goto skip_fd_checks;
+
+		for (i = 0; i < 2; i++) {
+			int cfd;
+			if (!pfd[i].revents)
+				continue;
+			/* pfd[i].revents = 0; - not needed */
+			cfd = accept(pfd[i].fd, NULL, NULL);
+			if (cfd < 0) {
+				/* odd... poll() says we can accept but accept failed? */
+				log(L_DEBUG2, "accept failed with %s", strerror(errno));
+				continue;
+			}
+			ndelay_on(cfd);
+			close_on_exec(cfd);
+			/* x[num_clients] is next free element, taking it */
+			log(L_DEBUG2, "new client %d, fd %d", num_clients, cfd);
+			pfd[num_clients].fd = cfd;
+			pfd[num_clients].events = POLLIN;
+			/* this will make us do read() in next for() loop: */
+			pfd[num_clients].revents = POLLIN;
+			memset(&cinfo[num_clients], 0, sizeof(cinfo[num_clients]));
+			/* cinfo[num_clients].bytecnt = 0; - done */
+			cinfo[num_clients].started_ms = g_now_ms;
+			cinfo[num_clients].bufidx = alloc_buf_no();
+			cinfo[num_clients].ureq = bufno2buf(cinfo[num_clients].bufidx);
+			num_clients++;
+			if (num_clients >= max_reqnum) {
+				/* stop accepting new connects for now */
+				pfd[0].events = pfd[0].revents = 0;
+				pfd[1].events = pfd[1].revents = 0;
+			}
+		}
+		for (; i < num_clients; i++) {
+			if (!pfd[i].revents)
+				continue;
+			log(L_DEBUG2, "pfd[%d].revents:0x%x", i, pfd[i].revents);
+			/* pfd[i].revents = 0; - not needed */
+
+			/* "Write out result" case */
+			if (pfd[i].revents == POLLOUT) {
+				response_header *resp;
+				uint32_t resp_sz;
+ write_out:
+				if (!cinfo[i].resptr) {
+					/* corner case: worker gave bad response (checked below the label so "goto write_out" is covered too) */
+					close_client(i);
+					continue;
+				}
+				resp = ureq_response(cinfo[i].resptr);
+				resp_sz = resp->version_or_size;
+				resp->version_or_size = NSCD_VERSION;
+				errno = 0;
+				r = safe_write(pfd[i].fd, ((char*) resp) + cinfo[i].respos, resp_sz - cinfo[i].respos);
+				resp->version_or_size = resp_sz;
+
+				if (r < 0 && errno == EAGAIN)
+					continue;
+				if (r <= 0) { /* client isn't there anymore */
+					log(L_DEBUG, "client %d is gone (write returned:%d err:%s)",
+							i, r, errno ? strerror(errno) : "-");
+ write_out_is_done:
+					if (cinfo[i].cache_pp == NULL) {
+						log(L_DEBUG, "client %d: freeing fake cache entry %p", i, cinfo[i].resptr);
+						free(cinfo[i].resptr);
+					} else {
+						/* Most of the time, it is not freed here,
+						 * only refcounted--. Freeing happens
+						 * if it was deleted from cache[] but retained
+						 * for writeout. */
+						free_refcounted_ureq(&cinfo[i].resptr);
+					}
+					close_client(i);
+					continue;
+				}
+				cinfo[i].respos += r;
+				if (cinfo[i].respos >= resp_sz) {
+					/* We wrote everything */
+					/* No point in trying to get next request, it won't come.
+					 * glibc 2.4 client closes its end after each request,
+					 * without testing for EOF from server. strace:
+					 * ...
+					 * read(3, "www.google.com\0\0", 16) = 16
+					 * close(3) = 0
+					 */
+					log(L_DEBUG, "client %u: sent answer %u bytes", i, cinfo[i].respos);
+					goto write_out_is_done;
+				}
+				continue; /* partial write: keep the client and wait for the next POLLOUT */
+			}
+
+			/* "Read reply from worker" case. Worker may be
+			 * already dead, revents may contain other bits too */
+			if ((pfd[i].revents & POLLIN) && cinfo[i].client_fd) {
+				log(L_DEBUG, "reading response for client %u", i);
+				handle_worker_response(i);
+				/* We can immediately try to write a response
+				 * to client */
+				goto write_out;
+			}
+
+			/* POLLHUP means pfd[i].fd is closed by peer.
+			 * POLLHUP+POLLOUT is seen when we switch for writeout
+			 * and see that pfd[i].fd is closed by peer. */
+			if ((pfd[i].revents & ~POLLOUT) == POLLHUP) {
+				int is_client = (cinfo[i].client_fd == 0 || cinfo[i].client_fd == pfd[i].fd);
+				log(L_INFO, "%s %u disappeared (got POLLHUP on fd %d)",
+					is_client ? "client" : "worker",
+					i,
+					pfd[i].fd
+				);
+				if (is_client)
+					close_client(i);
+				else {
+					/* Read worker output anyway, error handling
+					 * in that function deals with short read.
+					 * Simply closing client is wrong: it leaks
+					 * shared future entries. */
+					handle_worker_response(i);
+				}
+				continue;
+			}
+
+			/* All strange and unexpected cases */
+			if (pfd[i].revents != POLLIN) {
+				/* Not just "can read", but some other bits are there */
+				log(L_INFO, "client %u revents is strange:%x", i, pfd[i].revents);
+				close_client(i);
+				continue;
+			}
+
+			/* "Read request from client" case */
+			r = safe_read(pfd[i].fd, (char*)(cinfo[i].ureq) + cinfo[i].bytecnt, MAX_USER_REQ_SIZE - cinfo[i].bytecnt);
+			if (r < 0) {
+				log(L_DEBUG2, "error reading from client: %s", strerror(errno));
+				if (errno != EAGAIN)
+					close_client(i);
+				continue;
+			}
+			if (r == 0) {
+				log(L_INFO, "premature EOF from client, dropping");
+				close_client(i);
+				continue;
+			}
+			cinfo[i].bytecnt += r;
+			if (cinfo[i].bytecnt >= sizeof(user_req_header)) {
+				if (handle_client(i)) {
+					/* Response is found in cache! */
+					goto write_out;
+				}
+			}
+		} /* for each client[2..num_clients-1] */
+
+ skip_fd_checks:
+		/* Age cache */
+		if ((g_now_ms - last_age_time) >= aging_interval_ms) {
+			last_age_time = g_now_ms;
+			age_cache(/*free_all:*/ 0, -1);
+		}
+
+		/* Close timed out client connections */
+		for (i = 2; i < num_clients; i++) {
+			if (pfd[i].fd != 0 /* not closed yet? */ ////
+			 && cinfo[i].client_fd == 0 /* do we still wait for client, not worker? */
+			 && (g_now_ms - cinfo[i].started_ms) > CLIENT_TIMEOUT_MS
+			) {
+				log(L_INFO, "timed out waiting for client %u (%u ms), dropping",
+					i, (unsigned)(g_now_ms - cinfo[i].started_ms));
+				close_client(i);
+			}
+		}
+
+		if (!cnt_closed)
+			continue;
+
+		/* We closed at least one client, coalesce pfd[], cinfo[] */
+		if (min_closed + cnt_closed >= num_clients) {
+			/* clients [min_closed..num_clients-1] are all closed */
+			/* log(L_DEBUG, "taking shortcut"); - almost always happens */
+			goto coalesce_done;
+		}
+		j = min_closed;
+		i = min_closed + 1;
+		while (i < num_clients) {
+			while (1) {
+				if (pfd[i].fd)
+					break;
+				if (++i >= num_clients)
+					goto coalesce_done;
+			}
+			pfd[j] = pfd[i];
+			cinfo[j++] = cinfo[i++];
+		}
+
+ coalesce_done:
+		num_clients -= cnt_closed;
+		log(L_DEBUG, "removing %d closed clients. clients:%d", cnt_closed, num_clients);
+		min_closed = INT_MAX;
+		cnt_closed = 0;
+		/* start accepting new connects */
+		pfd[0].events = POLLIN;
+		pfd[1].events = POLLIN;
+	} /* while (1) */
+}
+
+
+/*
+** Initialization
+*/
+
+#define NSCD_PIDFILE    "/var/run/nscd/nscd.pid" /* written by write_pid() */
+#define NSCD_DIR        "/var/run/nscd"
+#define NSCD_SOCKET     "/var/run/nscd/socket" /* unlinked by cleanup_on_signal() */
+#define NSCD_SOCKET_OLD "/var/run/.nscd_socket" /* also unlinked there; presumably the legacy socket path - TODO confirm */
+/* Set by write_pid() once the pid file has been written; cleanup_on_signal() unlinks the file only then */
+static smallint wrote_pidfile;
+
+/* Signal handler: remove the pid file (only if write_pid() wrote it) and both socket paths, then exit(0) */
+static void cleanup_on_signal(int sig)
+{
+	if (wrote_pidfile != 0) unlink(NSCD_PIDFILE);
+	unlink(NSCD_SOCKET_OLD);
+	unlink(NSCD_SOCKET);
+	exit(0);
+}
+
+static void write_pid(void)
+{
+	FILE *fp = fopen(NSCD_PIDFILE, "w"); /* a pid file that cannot be opened is silently skipped */
+	if (fp != NULL) {
+		fprintf(fp, "%d\n", getpid());
+		fclose(fp);
+		wrote_pidfile = 1; /* lets cleanup_on_signal() know there is a pid file to unlink */
+	}
+}
+
+/* Create a non-blocking, close-on-exec AF_UNIX stream socket listening at path `name` (mode 0666); dies on any failure */
+static int open_socket(const char *name)
+{
+	struct sockaddr_un sun;
+	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (fd < 0)
+		perror_and_die("cannot create unix domain socket");
+	sun.sun_family = AF_UNIX;
+	strcpy(sun.sun_path, name);
+	ndelay_on(fd);
+	close_on_exec(fd);
+	unlink(name); /* unlink whatever currently exists at this path before bind() */
+	if (bind(fd, (struct sockaddr *) &sun, sizeof(sun)) < 0)
+		perror_and_die("bind(%s)", name);
+	if (chmod(name, 0666) < 0)
+		perror_and_die("chmod(%s)", name);
+	if (listen(fd, (max_reqnum/8) | 1) < 0)
+		perror_and_die("listen");
+	return fd;
+}
+
+static const struct option longopt[] = {
+	/* name, has_arg, int *flag, int val. Entries above the first '?' need a matching help[] line, in order */
+	{ "debug"      , no_argument      , NULL, 'd' },
+	{ "config-file", required_argument, NULL, 'f' },
+	{ "invalidate" , required_argument, NULL, 'i' },
+	{ "shutdown"   , no_argument      , NULL, 'K' },
+	{ "nthreads"   , required_argument, NULL, 't' },
+	{ "version"    , no_argument      , NULL, 'V' },
+	{ "help"       , no_argument      , NULL, '?' },
+	{ "usage"      , no_argument      , NULL, '?' },
+	/* just exit(0). TODO: "test" connect? */
+	{ "statistic"  , no_argument      , NULL, 'g' },
+	{ "secure"     , no_argument      , NULL, 'S' }, /* ? */
+	{ } /* all-zero terminator entry */
+};
+
+static const char *const help[] = { /* help[n] describes longopt[n]; print_help_and_die() stops at the first '?' option */
+	"Do not daemonize; log to stderr (-dd: more verbosity)",
+	"File to read configuration from",
+	"Invalidate cache",
+	"Shut the server down",
+	"Serve N requests in parallel",
+	"Version",
+};
+
+/* Print the usage text, one line for each longopt[] entry that precedes the
+ * first '?' option (each paired with its help[] string), then exit(1) */
+static void print_help_and_die(void)
+{
+	unsigned n = 0;
+
+	puts("Usage: nscd [OPTION...]\n"
+	     "Name Service Cache Daemon\n");
+	do {
+		printf("\t" "-%c,--%-11s %s\n", longopt[n].val, longopt[n].name, help[n]);
+		n++;
+	} while (longopt[n].val != '?');
+	exit(1);
+}
+
+/* Recognize service name s ("passwd", "group" or "hosts"), store its index in *srv
+ * and return the text that follows the name; NULL if the name is not known */
+static char *skip_service(int *srv, const char *s)
+{
+	if (strcmp("passwd", s) == 0)
+		*srv = SRV_PASSWD;
+	else if (strcmp("group", s) == 0)
+		*srv = SRV_GROUP;
+	else if (strcmp("hosts", s) == 0)
+		*srv = SRV_HOSTS;
+	else
+		return NULL;
+	return skip_whitespace(s + strlen(s) + 1); /* +1: step over the NUL that ends the name */
+}
+
+static void handle_null(const char *str, int srv) {} /* directive is recognized but has no effect */
+/* "logfile": store xstrdup(str) in config.logfile */
+static void handle_logfile(const char *str, int srv)
+{
+	config.logfile = xstrdup(str);
+}
+/* "debug-level": OR the low 8 bits of getnum(str) into the debug mask */
+static void handle_debuglvl(const char *str, int srv)
+{
+	debug |= (uint8_t) getnum(str);
+}
+/* "threads" / "max-threads": may raise max_reqnum, never lowers it */
+static void handle_threads(const char *str, int srv)
+{
+	unsigned n = getnum(str);
+	if (max_reqnum < n)
+		max_reqnum = n;
+}
+/* "server-user": store xstrdup(str) in config.user */
+static void handle_user(const char *str, int srv)
+{
+	config.user = xstrdup(str);
+}
+/* "enable-cache <service>": enabled iff the value starts with 'y' or 'Y' */
+static void handle_enable(const char *str, int srv)
+{
+	config.srv_enable[srv] = ((str[0] | 0x20) == 'y');
+}
+/* "positive-time-to-live <service>": store getnum(str) as that service's positive TTL */
+static void handle_pttl(const char *str, int srv)
+{
+	config.pttl[srv] = getnum(str);
+}
+/* "negative-time-to-live <service>": store getnum(str) as that service's negative TTL */
+static void handle_nttl(const char *str, int srv)
+{
+	config.nttl[srv] = getnum(str);
+}
+/* "suggested-size <service>": store getnum(str) in config.size[srv] */
+static void handle_size(const char *str, int srv)
+{
+	config.size[srv] = getnum(str);
+}
+/* "check-files <service>": on iff the value starts with 'y' or 'Y' */
+static void handle_chfiles(const char *str, int srv)
+{
+	config.check_files[srv] = ((str[0] | 0x20) == 'y');
+}
+
+/* Read conffile line by line and hand each known directive to its handler. A missing file is
+ * fatal unless it is default_conffile; unknown directives/services are reported only if warn is set. */
+static void parse_conffile(const char *conffile, int warn)
+{
+	static const struct confword {
+		const char *str; /* 1st char: 'S' = a service name follows the directive, '_' = it does not */
+		void (*handler)(const char *, int);
+	} conf_words[] = {
+		{ "_" "logfile"               , handle_logfile  },
+		{ "_" "debug-level"           , handle_debuglvl },
+		{ "_" "threads"               , handle_threads  },
+		{ "_" "max-threads"           , handle_threads  },
+		{ "_" "server-user"           , handle_user     },
+		/* ignore: any user can stat */
+		{ "_" "stat-user"             , handle_null     },
+		{ "_" "paranoia"              , handle_null     }, /* ? */
+		/* ignore: design goal is to never crash/hang */
+		{ "_" "reload-count"          , handle_null     },
+		{ "_" "restart-interval"      , handle_null     },
+		{ "S" "enable-cache"          , handle_enable   },
+		{ "S" "positive-time-to-live" , handle_pttl     },
+		{ "S" "negative-time-to-live" , handle_nttl     },
+		{ "S" "suggested-size"        , handle_size     },
+		{ "S" "check-files"           , handle_chfiles  },
+		{ "S" "persistent"            , handle_null     }, /* ? */
+		{ "S" "shared"                , handle_null     }, /* ? */
+		{ "S" "auto-propagate"        , handle_null     }, /* ? */
+		{ }
+	};
+
+	char buf[128];
+	FILE *file = fopen(conffile, "r");
+	int lineno = 0;
+	if (!file) {
+		if (conffile != default_conffile)
+			perror_and_die("cannot open %s", conffile);
+		return;
+	}
+	while (fgets(buf, sizeof(buf), file) != NULL) {
+		const struct confword *word;
+		char *p;
+		int len = strlen(buf);
+		memset(buf + len, 0, sizeof(buf) - len); /* the tokenizer steps past a word's NUL: no stale bytes of earlier lines there */
+		lineno++;
+		if (len) {
+			if (buf[len-1] != '\n') {
+				if (len >= sizeof(buf) - 1)
+					error_and_die("%s:%d: line is too long", conffile, lineno);
+				len++; /* last line, not terminated by '\n' */
+			}
+			buf[len-1] = '\0';
+		}
+		p = strchr(buf, '#');
+		if (p)
+			memset(p, 0, strlen(p)); /* wipe the whole comment, so its text can never be taken for an argument */
+
+		p = skip_whitespace(buf);
+		if (!*p)
+			continue;
+		*skip_non_whitespace(p) = '\0';
+		word = conf_words;
+		while (1) {
+			if (strcmp(word->str + 1, p) == 0) {
+				int srv = 0;
+				p = skip_whitespace(p + strlen(p) + 1);
+				*skip_non_whitespace(p) = '\0';
+				if (word->str[0] == 'S') {
+					char *p2 = skip_service(&srv, p);
+					if (!p2) {
+						if (warn)
+							error("%s:%d: ignoring unknown service name '%s'", conffile, lineno, p);
+						break;
+					}
+					p = p2;
+					*skip_non_whitespace(p) = '\0';
+				}
+				word->handler(p, srv);
+				break;
+			}
+			word++;
+			if (!word->str) {
+				if (warn)
+					error("%s:%d: ignoring unknown directive '%s'", conffile, lineno, p);
+				break;
+			}
+		}
+	}
+	fclose(file);
+}
+
+
+/* "XX,XX[,XX]..." -> gid_t[] (hex fields, as built by user_to_env_U()).
+ * Returns an xmalloc'ed array, element count goes to *sizep; caller frees.
+ * Malformed input, including an empty field, is fatal. */
+static gid_t* env_U_to_uid_and_gids(const char *str, int *sizep)
+{
+	const char *sp;
+	gid_t *ug;
+	int ng;
+
+	ng = 1; /* N commas separate N+1 fields */
+	for (sp = str; *sp; sp++)
+		if (*sp == ',')
+			ng++;
+	ug = xmalloc(ng * sizeof(ug[0]));
+
+	ng = 0;
+	sp = str;
+	errno = 0;
+	while (1) {
+		char *end;
+		ug[ng++] = strtoul(sp, &end, 16);
+		/* end == sp: strtoul consumed no digits and returned 0 */
+		if (errno || end == sp || (*end != ',' && *end != '\0'))
+			error_and_die("internal error");
+		if (*end == '\0')
+			break;
+		sp = end + 1;
+	}
+	*sizep = ng;
+	return ug;
+}
+
+
+/* Build the "U=uid,gid[,gid]..." environment string (hex numbers) for `user`.
+ * Does the getpwnam() and getgrouplist() lookups and dies if either fails.
+ * Returns an xmalloc'ed string. */
+static char* user_to_env_U(const char *user)
+{
+	int i, total, ngroups = 64; /* 64: first guess at the group count */
+	gid_t *ids;
+	char *env_str, *out;
+	struct passwd *pw = getpwnam(user);
+
+	if (!pw)
+		perror_and_die("user '%s' is not known", user);
+
+	/* ids[0] is reserved for the uid, groups start at ids[1] */
+	ids = xmalloc((1 + ngroups) * sizeof(ids[0]));
+	if (getgrouplist(user, pw->pw_gid, &ids[1], &ngroups) < 0) {
+		/* too small: ngroups now holds the needed count, retry once */
+		ids = xrealloc(ids, (1 + ngroups) * sizeof(ids[0]));
+		if (getgrouplist(user, pw->pw_gid, &ids[1], &ngroups) < 0)
+			perror_and_die("can't get groups of user '%s'", user);
+	}
+	ids[0] = pw->pw_uid;
+	total = 1 + ngroups;
+
+	/* Enough for "U=", NUL and, per number, all hex digits of a long + ',' */
+	env_str = xmalloc((sizeof(unsigned long)+1)*2 * total + 3);
+	out = env_str;
+	*out++ = 'U';
+	*out++ = '=';
+	for (i = 0; i < total; i++)
+		out += sprintf(out, "%lx,", (unsigned long)ids[i]);
+	out[-1] = '\0'; /* overwrite the trailing comma */
+
+	free(ids);
+	return env_str;
+}
+
+/* Store the target of the /proc/self/exe symlink in self_exe_points_to.
+ * Not static - don't inline me, compiler! */
+void readlink_self_exe(void);
+void readlink_self_exe(void)
+{
+	char target[PATH_MAX + 1];
+	ssize_t n = readlink("/proc/self/exe", target, PATH_MAX);
+	if (n < 0)
+		perror_and_die("readlink %s failed", "/proc/self/exe");
+	target[n] = '\0'; /* readlink() does not terminate the string */
+	self_exe_points_to = xstrdup(target);
+}
+
+/* Client mode: connect to the daemon's socket NSCD_SOCKET, send one request
+ * and exit(0). arg == NULL requests SHUTDOWN; otherwise INVALIDATE is sent
+ * with `arg` (terminating NUL included) as the key. */
+static void special_op(const char *arg) NORETURN;
+static void special_op(const char *arg)
+{
+	static const user_req_header ureq = { NSCD_VERSION, SHUTDOWN, 0 };
+	struct sockaddr_un addr;
+	int sock;
+
+	sock = socket(PF_UNIX, SOCK_STREAM, 0);
+	if (sock < 0)
+		error_and_die("cannot create AF_UNIX socket");
+	addr.sun_family = AF_UNIX;
+	strcpy(addr.sun_path, NSCD_SOCKET);
+	if (connect(sock, (struct sockaddr *) &addr, sizeof(addr)) < 0)
+		error_and_die("cannot connect to %s", NSCD_SOCKET);
+
+	if (!arg) { /* shutdown */
+		xfull_write(sock, &ureq, sizeof(ureq));
+		printf("sent shutdown request, exiting\n");
+	} else { /* invalidate */
+		/* Heap buffer, as a VLA struct member is a GNU C only extension */
+		size_t arg_len = strlen(arg) + 1;
+		user_req_header req = ureq;
+		char *msg = xmalloc(sizeof(req) + arg_len);
+		req.type = INVALIDATE;
+		req.key_len = arg_len;
+		memcpy(msg, &req, sizeof(req));
+		memcpy(msg + sizeof(req), arg, arg_len);
+		xfull_write(sock, msg, sizeof(req) + arg_len);
+		free(msg);
+		printf("sent invalidate(%s) request, exiting\n", arg);
+	}
+	exit(0);
+}
+
+
+/* Callback for glibc-2.15: main() hands it to __nss_disable_nscd(); a no-op */
+struct traced_file;
+static void do_nothing(size_t dbidx, struct traced_file *finfo)
+{
+	/* nscd from glibc-2.15 does something like this:
+	if (!dbs[dbidx].enabled || !dbs[dbidx].check_file)
+		return;
+	add_file_to_watch_list(finfo->fname); */
+	(void)dbidx, (void)finfo; /* deliberately unused */
+}
+
+/* This internal glibc function is called to disable trying to contact nscd.
+ * We _are_ nscd, so we need to do the lookups, and not recurse.
+ * Until 2.14, this function was taking no parameters.
+ * In 2.15, it takes a function pointer from hell (main() passes do_nothing).
+ */
+void __nss_disable_nscd(void (*hell)(size_t, struct traced_file*));
+
+/* Entry point: worker mode, a one-shot client request (-i, -K), or the daemon itself */
+int main(int argc, char **argv)
+{
+	int n;
+	unsigned opt_d_cnt;
+	const char *env_U;
+	const char *conffile;
+
+	/* make sure we don't get recursive calls */
+	__nss_disable_nscd(do_nothing);
+
+	if (argv[0][0] == 'w') /* "worker_nscd" */
+		worker(argv[1]);
+
+	setlinebuf(stdout);
+	setlinebuf(stderr);
+
+	/* Make sure stdio is not closed: dup() the /dev/null fd until fd 2 is reached */
+	n = xopen3("/dev/null", O_RDWR, 0);
+	while (n < 2)
+		n = dup(n);
+	/* Close unexpected open file descriptors */
+	n |= 0xff; /* start from at least fd# 255 */
+	do {
+		close(n--);
+	} while (n > 2);
+
+	/* For idiotic kernels which disallow "exec /proc/self/exe" */
+	readlink_self_exe();
+
+	conffile = default_conffile;
+	opt_d_cnt = 0;
+	while ((n = getopt_long(argc, argv, "df:i:KVgt:", longopt, NULL)) != -1) {
+		switch (n) {
+		case 'd':
+			opt_d_cnt++;
+			debug &= ~D_DAEMON;
+			break;
+		case 'f':
+			conffile = optarg;
+			break;
+		case 'i':
+			/* invalidate */
+			special_op(optarg); /* exits */
+		case 'K':
+			/* shutdown server */
+			special_op(NULL); /* exits */
+		case 'V':
+			puts("unscd - nscd which does not hang, v."PROGRAM_VERSION);
+			exit(0);
+		case 'g':
+			exit(0); /* nothing is printed, just exit */
+		case 't':
+			/* N threads */
+			max_reqnum = getnum(optarg);
+			break;
+		case 'S':
+			/* secure (?) - NOTE(review): 'S' is not in the short optstring; presumably it comes from longopt[] - confirm */
+			break;
+		default:
+			print_help_and_die();
+		}
+	}
+	/* Multiple -d can bump debug regardless of nscd.conf:
+	 * no -d or -d: 0, -dd: 1,
+	 * -ddd: 3, -dddd: 7, -ddddd: 15
+	 */
+	if (opt_d_cnt != 0)
+		debug |= (((1U << opt_d_cnt) >> 1) - 1) & L_ALL;
+
+	env_U = getenv("U");
+	/* Avoid duplicate warnings if $U exists */
+	parse_conffile(conffile, /* warn? */ (env_U == NULL));
+
+	/* I have a user report of (broken?) ldap nss library
+	 * opening and never closing a socket to a ldap server,
+	 * even across fork() and exec(). This messes up
+	 * worker child's operations for the reporter.
+	 *
+	 * This strengthens my belief that nscd _must not_ trust
+	 * nss libs to be written correctly.
+	 *
+	 * Here, we need to jump through the hoops to guard against
+	 * such problems. If config file has server-user setting, we need
+	 * to setgroups + setuid. For that, we need to get uid and gid vector.
+	 * And that means possibly using buggy nss libs.
+	 * We will do it here, but then we will re-exec, passing uid+gids
+	 * in an environment variable.
+	 */
+	if (!env_U && config.user) {
+		/* user_to_env_U() does getpwnam and getgrouplist */
+		if (putenv(user_to_env_U(config.user)))
+			error_and_die("out of memory");
+		/* fds leaked by nss will be closed by execed copy */
+		execv(self_exe_points_to, argv);
+		xexecve("/proc/self/exe", argv, environ);
+	}
+
+	/* Allocate dynamically sized stuff */
+	max_reqnum += 2; /* account for 2 first "fake" clients */
+	if (max_reqnum < 8) max_reqnum = 8; /* sanitize */
+	/* Since refcount is a byte, can't serve more than 255-2 clients
+	 * at once. The rest will block in connect() */
+	if (max_reqnum > 0xff) max_reqnum = 0xff;
+	client_buf = xzalloc(max_reqnum * sizeof(client_buf[0]));
+	busy_cbuf  = xzalloc(max_reqnum * sizeof(busy_cbuf[0]));
+	pfd        = xzalloc(max_reqnum * sizeof(pfd[0]));
+	cinfo      = xzalloc(max_reqnum * sizeof(cinfo[0]));
+
+	cache_size = (config.size[0] + config.size[1] + config.size[2]) / 8;
+	if (cache_size < 8) cache_size = 8; /* 8*8 = 64 entries min */
+	if (cache_size > 0xffff) cache_size = 0xffff; /* 8*64k entries max */
+	cache_size |= 1; /* force it to be odd */
+	cache = xzalloc(cache_size * sizeof(cache[0]));
+
+	/* Register cleanup hooks */
+	signal(SIGINT, cleanup_on_signal);
+	signal(SIGTERM, cleanup_on_signal);
+	/* Don't die if a client closes a socket on us */
+	signal(SIGPIPE, SIG_IGN);
+	/* Avoid creating zombies */
+	signal(SIGCHLD, SIG_IGN);
+#if !DEBUG_BUILD
+	/* Ensure workers don't have SIGALRM ignored */
+	signal(SIGALRM, SIG_DFL);
+#endif
+
+	if (mkdir(NSCD_DIR, 0755) == 0) {
+		/* prevent bad mode of NSCD_DIR if umask is e.g. 077 */
+		chmod(NSCD_DIR, 0755);
+	}
+	pfd[0].fd = open_socket(NSCD_SOCKET);
+	pfd[1].fd = open_socket(NSCD_SOCKET_OLD);
+	pfd[0].events = POLLIN;
+	pfd[1].events = POLLIN;
+
+	if (debug & D_DAEMON) {
+		daemon(/*nochdir*/ 1, /*noclose*/ 0);
+		if (config.logfile) {
+			/* nochdir=1: relative paths still work as expected */
+			xmovefd(xopen3(config.logfile, O_WRONLY|O_CREAT|O_TRUNC, 0666), 2);
+			debug |= D_STAMP;
+		} else {
+			debug = 0; /* why bother? it's /dev/null'ed anyway */
+		}
+		chdir("/"); /* compat */
+		write_pid();
+		setsid();
+		/* ignore job control signals */
+		signal(SIGTTOU, SIG_IGN);
+		signal(SIGTTIN, SIG_IGN);
+		signal(SIGTSTP, SIG_IGN);
+	}
+
+	log(L_ALL, "unscd v" PROGRAM_VERSION ", debug level 0x%x", debug & L_ALL);
+	log(L_DEBUG, "max %u requests in parallel", max_reqnum - 2);
+	log(L_DEBUG, "cache size %u x 8 entries", cache_size);
+
+	if (env_U) {
+		int size;
+		gid_t *ug = env_U_to_uid_and_gids(env_U, &size); /* ug[0]: uid, ug[1..]: gids */
+		if (size > 1)
+			if (setgroups(size - 1, &ug[1]) || setgid(ug[1]))
+				perror_and_die("cannot set groups for user '%s'", config.user);
+		if (size > 0)
+			if (setuid(ug[0]))
+				perror_and_die("cannot set uid to %u", (unsigned)(ug[0]));
+		free(ug);
+	}
+
+	for (n = 0; n < 3; n++) {
+		log(L_DEBUG, "%s cache enabled:%u pttl:%u nttl:%u",
+				srv_name[n],
+				config.srv_enable[n],
+				config.pttl[n],
+				config.nttl[n]);
+		config.pttl[n] *= 1000; /* presumably seconds -> milliseconds - TODO confirm */
+		config.nttl[n] *= 1000;
+	}
+
+	main_loop();
+
+	return 0;
+}
-- 
2.39.2