3 Copyright (c) 2008 Genome Research Ltd (GRL).
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
28 /* Probably I will not do socket programming in the next few years and
29 therefore I decide to heavily annotate this file, for Linux and
30 Windows as well. -lh3 */
38 #include <sys/types.h>
44 #include <arpa/inet.h>
45 #include <sys/socket.h>
50 /* In winsock.h, the type of a socket is SOCKET, which is: "typedef
51 * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
52 * integer -1. In knetfile.c, I use "int" for socket type
53 * throughout. This should be improved to avoid confusion.
55 * In Linux/Mac, recv() and read() do almost the same thing. You can see
56 * in the header file that netread() is simply an alias of read(). In
57 * Windows, however, they are different and using recv() is mandatory.
60 /* This function tests if the file descriptor is ready for reading (or
61 * writing if is_read==0). */
/* Waits on fd with select(), using a fixed timeout of 5 seconds.
 *   fd      - descriptor to wait on.
 *   is_read - non-zero places fd in select()'s read set.
 * A select() failure (-1) is reported with perror("select").
 * Callers in this file treat a result <= 0 as "not ready".
 * NOTE(review): the embedded line numbers show gaps inside this body, so the
 * fd_set initialisation, the write-set branch and the return statement are
 * not visible in this listing -- confirm against the complete file. */
62 static int socket_wait(int fd, int is_read)
64 fd_set fds, *fdr = 0, *fdw = 0;
67 tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
70 if (is_read) fdr = &fds;
// first argument of select() is the highest descriptor number plus one
72 ret = select(fd+1, fdr, fdw, 0, &tv);
73 if (ret == -1) perror("select");
78 /* This function does not work with Windows due to the lack of
79 * getaddrinfo() in winsock. It is adapted from an example in "Beej's
80 * Guide to Network Programming" (http://beej.us/guide/bgnet/). */
/* Opens a TCP connection to host:port.
 * The name is resolved with getaddrinfo() (any address family, stream
 * sockets); a socket is created from the first result only, SO_REUSEADDR and
 * SO_LINGER are set on it, and connect() is called.
 * Every failure goes through __err_connect, which calls perror(),
 * freeaddrinfo(res) and returns -1. The success return is not visible in this
 * listing.
 * NOTE(review): if getaddrinfo() itself fails, __err_connect still calls
 * freeaddrinfo(res) although res is declared without an initialiser here.
 * NOTE(review): getaddrinfo() reports errors through its return value rather
 * than errno, so the perror() text for that case may be unrelated.
 * NOTE(review): __err_connect does not close fd, so failures after socket()
 * succeed appear to leak the descriptor. */
81 static int socket_connect(const char *host, const char *port)
83 #define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)
86 struct linger lng = { 0, 0 };
87 struct addrinfo hints, *res;
88 memset(&hints, 0, sizeof(struct addrinfo));
89 hints.ai_family = AF_UNSPEC;
90 hints.ai_socktype = SOCK_STREAM;
91 /* In Unix/Mac, getaddrinfo() is the most convenient way to get
92 * server information. */
93 if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo");
94 if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
95 /* The following two setsockopt() are used by ftplib
96 * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they
98 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
99 if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
100 if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
/* MinGW's printf has a problem with "%lld", so 64-bit values are
 * formatted by hand.
 *
 * Writes the decimal representation of x into buf, NUL-terminates it and
 * returns buf. buf must provide at least 21 bytes (20 digits for the largest
 * uint64_t value plus the terminator). x == 0 yields "0". */
char *uint64tostr(char *buf, uint64_t x)
{
	int cnt, i = 0;
	/* do/while so that zero still produces one digit */
	do {
		buf[i++] = (char)('0' + x % 10);
		x /= 10;
	} while (x);
	buf[i] = 0;
	/* digits were emitted least-significant first; reverse them in place */
	for (cnt = i, i = 0; i < cnt/2; ++i) {
		char c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c;
	}
	return buf;
}
116 /* In windows, the first thing is to establish the TCP connection. */
/* Calls WSAStartup() requesting Winsock version 2.2 and returns its result
 * unchanged. The declaration of wsaData is not visible in this listing. */
117 int knet_win32_init()
120 return WSAStartup(MAKEWORD(2, 2), &wsaData);
/* NOTE(review): only the signature is visible in this listing; presumably the
 * teardown counterpart of knet_win32_init() -- confirm against the full file. */
122 void knet_win32_destroy()
126 /* A slightly modified version of the following function also works on
127 * Mac (and presumably Linux). However, this function is not stable on
128 * my Mac. It sometimes works fine but sometimes does not. Therefore for
129 * non-Windows OS, I do not use this one. */
/* Opens an IPv4 TCP connection to host:port.
 * A socket is created (AF_INET, SOCK_STREAM, IPPROTO_TCP) and SO_REUSEADDR
 * and SO_LINGER are set on it. A host whose first character is alphabetic is
 * resolved with gethostbyname(); the inet_addr()/gethostbyaddr() lines look
 * like the alternative path for dotted addresses (the branch keyword is not
 * visible here). The port string is converted with atoi(), so only numeric
 * ports work.
 * Failures go through __err_connect: perror() and return -1.
 * NOTE(review): __err_connect does not close fd, so failures after socket()
 * succeed appear to leak the socket. */
130 static SOCKET socket_connect(const char *host, const char *port)
132 #define __err_connect(func) do { perror(func); return -1; } while (0)
136 struct linger lng = { 0, 0 };
137 struct sockaddr_in server;
138 struct hostent *hp = 0;
140 if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
141 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
142 if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
144 if (isalpha(host[0])) hp = gethostbyname(host);
147 addr.s_addr = inet_addr(host);
148 hp = gethostbyaddr((char*)&addr, 4, AF_INET);
150 if (hp == 0) __err_connect("gethost");
// the resolved address is read through an unsigned long pointer
152 server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
153 server.sin_family= AF_INET;
154 server.sin_port = htons(atoi(port));
155 if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
156 // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
/* Reads from socket fd into buf, accumulating partial reads.
 *   len - number of bytes requested; rest starts at len and l at 0.
 * Each pass waits for readability with socket_wait() and then calls
 * netread(); the loop is left when the socket is not ready or netread()
 * returns 0.
 * NOTE(review): the loop header and the return statement are not visible in
 * this listing (presumably loops while rest is non-zero and returns l).
 * NOTE(review): a negative netread() result is not checked in the visible
 * lines and would be added to l and subtracted from rest.
 * NOTE(review): buf + l is arithmetic on a void pointer, a compiler
 * extension rather than standard C. */
161 static off_t my_netread(int fd, void *buf, off_t len)
163 off_t rest = len, curr, l = 0;
164 /* recv() and read() may not read the required length of data with
165 * one call. They have to be called repeatedly. */
167 if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
168 curr = netread(fd, buf + l, rest);
169 /* According to the glibc manual, section 13.2, a zero returned
170 * value indicates end-of-file (EOF), which should mean that
171 * read() will not return zero if EOF has not been met but data
172 * are not immediately available. */
173 if (curr == 0) break;
174 l += curr; rest -= curr;
179 /*************************
180 * FTP specific routines *
181 *************************/
/* Reads a reply from the FTP control connection into ftp->response and
 * returns the number parsed from its start with strtol().
 * Returns 0 when the control socket is not readable within socket_wait()'s
 * timeout, and -1 when fewer than two bytes were collected.
 * NOTE(review): several body lines are not visible in this listing (the local
 * declarations and whatever surrounds the termination test).
 * NOTE(review): the realloc() result is stored without a NULL check.
 * NOTE(review): the loop condition treats any non-zero netread() result,
 * including a negative one, as a successfully read byte. */
183 static int kftp_get_response(knetFile *ftp)
188 if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
189 while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
// grow the reply buffer: 256 bytes on first use, doubled afterwards
191 if (n >= ftp->max_response) {
192 ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256;
193 ftp->response = realloc(ftp->response, ftp->max_response);
195 ftp->response[n++] = c;
// three leading digits followed by anything other than '-' ends the loop
197 if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2])
198 && ftp->response[3] != '-') break;
203 if (n < 2) return -1;
// cut off the last two stored bytes (presumably the trailing CR LF)
204 ftp->response[n-2] = 0;
205 return strtol(ftp->response, &p, 0);
/* Sends one command string on the FTP control connection.
 *   cmd    - NUL-terminated command; all strlen(cmd) bytes are written.
 *   is_get - non-zero: read the reply and return kftp_get_response()'s value;
 *            zero: return 0 without reading a reply.
 * Returns -1 when the control socket is not writable within socket_wait()'s
 * timeout.
 * NOTE(review): the netwrite() result is not checked. */
208 static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
210 if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
211 netwrite(ftp->ctrl_fd, cmd, strlen(cmd));
212 return is_get? kftp_get_response(ftp) : 0;
/* Sends PASV and records the data-connection endpoint from the reply.
 * The reply text is scanned for '('; -1 is returned when none is found.
 * Six comma-separated integers are parsed: the first four are copied into
 * ftp->pasv_ip and the last two form the port (high byte, low byte).
 * NOTE(review): the line that advances p past '(' and the final return are
 * not visible in this listing.
 * NOTE(review): the results of kftp_send_cmd() and sscanf() are not checked
 * in the visible lines. */
215 static int kftp_pasv_prep(knetFile *ftp)
219 kftp_send_cmd(ftp, "PASV\r\n", 1);
220 for (p = ftp->response; *p && *p != '('; ++p);
221 if (*p != '(') return -1;
223 sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
224 memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
// port = v[4] * 256 + v[5], with v[4] masked to a single byte after the shift
225 ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
/* Opens the FTP data connection to the endpoint stored in ftp->pasv_ip and
 * ftp->pasv_port, and stores the socket in ftp->fd.
 * A pasv_port of 0 is taken to mean that no endpoint has been recorded yet
 * and is reported on stderr. Returns -1 when socket_connect() fails.
 * NOTE(review): the return after the stderr message and the success return
 * are not visible in this listing. */
230 static int kftp_pasv_connect(knetFile *ftp)
232 char host[80], port[10];
233 if (ftp->pasv_port == 0) {
234 fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
// socket_connect() takes strings, so format the dotted quad and the port
237 sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
238 sprintf(port, "%d", ftp->pasv_port);
239 ftp->fd = socket_connect(host, port);
240 if (ftp->fd == -1) return -1;
/* Opens the FTP control connection to ftp->host:ftp->port and logs in.
 * After connecting it reads the server greeting, then sends USER anonymous,
 * PASS kftp@ and TYPE I, reading a reply after each command.
 * Returns -1 when socket_connect() fails; the success return is not visible
 * in this listing.
 * NOTE(review): none of the reply codes are inspected in the visible lines,
 * so a rejected login is not detected here. */
244 int kftp_connect(knetFile *ftp)
246 ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
247 if (ftp->ctrl_fd == -1) return -1;
248 kftp_get_response(ftp);
249 kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
250 kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
251 kftp_send_cmd(ftp, "TYPE I\r\n", 1);
/* Closes the existing control connection, if any (ctrl_fd != -1), and then
 * returns the result of a fresh kftp_connect().
 * NOTE(review): the lines between netclose() and the final return are not
 * visible in this listing. */
255 int kftp_reconnect(knetFile *ftp)
257 if (ftp->ctrl_fd != -1) {
258 netclose(ftp->ctrl_fd);
262 return kftp_connect(ftp);
265 // initialize ->type, ->host and ->retr
/* Builds a knetFile from an "ftp://host/path" URL.
 *   fn   - URL; 0 is returned unless it begins with "ftp://" and contains a
 *          '/' after the host part.
 *   mode - if it contains 'c', no_reconnect is set.
 * The new object gets type KNF_TYPE_FTP, port "21", host copied from the
 * characters after "ftp://", retr set to "RETR <path>\r\n" and
 * seek_offset set to -1.
 * NOTE(review): the assignment of l (presumably the host length) and the
 * return statement are not visible in this listing.
 * NOTE(review): the calloc() and strdup() results are used without NULL
 * checks. */
266 knetFile *kftp_parse_url(const char *fn, const char *mode)
271 if (strstr(fn, "ftp://") != fn) return 0;
// advance p to the first '/' after the scheme; p then points at the path
272 for (p = (char*)fn + 6; *p && *p != '/'; ++p);
273 if (*p != '/') return 0;
275 fp = calloc(1, sizeof(knetFile));
276 fp->type = KNF_TYPE_FTP;
278 /* the Linux/Mac version of socket_connect() also recognizes a port
279 * like "ftp", but the Windows version does not. */
280 fp->port = strdup("21");
281 fp->host = calloc(l + 1, 1);
282 if (strchr(mode, 'c')) fp->no_reconnect = 1;
283 strncpy(fp->host, fn + 6, l);
// 8 extra bytes: "RETR " (5) + "\r\n" (2) + terminating NUL (1)
284 fp->retr = calloc(strlen(p) + 8, 1);
285 sprintf(fp->retr, "RETR %s\r\n", p);
286 fp->seek_offset = -1;
289 // place ->fd at offset off
/* Starts an FTP transfer of the file named in fp->retr at fp->offset.
 * Visible steps: with no_reconnect set, one pending reply is read first; a
 * "REST <offset>" command is built in tmp and sent, waiting for its reply;
 * fp->retr is sent without waiting for a reply; the data connection is
 * opened with kftp_pasv_connect(); then the reply is read into ret, and
 * fp->response is printed to stderr.
 * NOTE(review): many lines are missing from this listing. The two ways of
 * building the REST command (sprintf with %lld, and strcpy + uint64tostr)
 * look like alternatives selected by a condition that is not shown, and the
 * condition guarding the stderr message, the kftp_pasv_prep() call and the
 * return value are not visible either. */
290 int kftp_connect_file(knetFile *fp)
295 if (fp->no_reconnect) kftp_get_response(fp);
301 sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
303 strcpy(tmp, "REST ");
// writes the digits directly after the 5-character "REST " prefix
304 uint64tostr(tmp + 5, fp->offset);
307 kftp_send_cmd(fp, tmp, 1);
309 kftp_send_cmd(fp, fp->retr, 0);
310 kftp_pasv_connect(fp);
311 ret = kftp_get_response(fp);
313 fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
322 /**************************
323 * HTTP specific routines *
324 **************************/
/* Builds a knetFile from an "http://host[:port]/path" URL.
 *   fn   - URL; 0 is returned unless it begins with "http://".
 *   mode - not referenced in the visible lines.
 * http_host receives the text between "http://" and the first '/', and is
 * then cut at ':' so that q points at the port text (or at the terminator).
 * The http_proxy environment variable is read. Two groups of assignments
 * follow:
 *   - host = http_host, port = q or "80", path = p or "/";
 *   - host = proxy value with an optional leading "http://" removed and cut
 *     at ':', port = text after ':' or "80", path = the full URL.
 * Per the existing comment the first group is the no-proxy case; the
 * condition selecting between them is not visible in this listing.
 * Finally type is KNF_TYPE_HTTP, both descriptors are -1 and seek_offset
 * is -1.
 * NOTE(review): the assignment of l and the return statement are not visible
 * in this listing.
 * NOTE(review): because http_host is cut at ':', it no longer carries an
 * explicit port.
 * NOTE(review): the calloc() and strdup() results are used without NULL
 * checks. */
326 knetFile *khttp_parse_url(const char *fn, const char *mode)
331 if (strstr(fn, "http://") != fn) return 0;
333 for (p = (char*)fn + 7; *p && *p != '/'; ++p);
335 fp = calloc(1, sizeof(knetFile));
336 fp->http_host = calloc(l + 1, 1);
337 strncpy(fp->http_host, fn + 7, l);
338 fp->http_host[l] = 0;
339 for (q = fp->http_host; *q && *q != ':'; ++q);
340 if (*q == ':') *q++ = 0;
342 proxy = getenv("http_proxy");
343 // set ->host, ->port and ->path
345 fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
346 fp->port = strdup(*q? q : "80");
347 fp->path = strdup(*p? p : "/");
349 fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
350 for (q = fp->host; *q && *q != ':'; ++q);
351 if (*q == ':') *q++ = 0;
352 fp->port = strdup(*q? q : "80");
353 fp->path = strdup(fn);
355 fp->type = KNF_TYPE_HTTP;
356 fp->ctrl_fd = fp->fd = -1;
357 fp->seek_offset = -1;
/* (Re)opens the HTTP connection and requests fp->path from fp->offset.
 * Visible steps: an open fp->fd is closed; a new socket is connected to
 * fp->host:fp->port; an HTTP/1.0 GET with Host and "Range: bytes=<offset>-"
 * headers is written; the response header is read one byte at a time until
 * "\r\n\r\n"; the status code is parsed from position 8 of the header.
 * A 200 with a non-zero offset means the whole file is being sent, so
 * fp->offset bytes are read and discarded in chunks of at most 0x10000.
 * Any status other than 200 or 206 is reported on stderr.
 * NOTE(review): several lines are missing from this listing, including the
 * condition (if any) guarding the Range header, the handling of a short
 * header and the return statements.
 * NOTE(review): the socket_connect() and calloc() results are not checked in
 * the visible lines, and the header-reading loop shows no bound against the
 * 0x10000-byte buffer. */
361 int khttp_connect_file(knetFile *fp)
365 if (fp->fd != -1) netclose(fp->fd);
366 fp->fd = socket_connect(fp->host, fp->port);
367 buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
368 l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
370 l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
371 l += sprintf(buf + l, "\r\n");
372 netwrite(fp->fd, buf, l);
374 while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
375 if (buf[l] == '\n' && l >= 3)
376 if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
380 if (l < 14) { // premature header
// skips the first 8 bytes of the status line before parsing the code
385 ret = strtol(buf + 8, &p, 0); // HTTP return code
386 if (ret == 200 && fp->offset) { // 200 (complete result); then skip beginning of the file
387 off_t rest = fp->offset;
// this l is a new block-local variable that shadows the outer l
389 off_t l = rest < 0x10000? rest : 0x10000;
390 rest -= my_netread(fp->fd, buf, l);
392 } else if (ret != 206 && ret != 200) {
394 fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
404 /********************
406 ********************/
/* Opens fn for reading and returns a knetFile handle.
 *   fn   - "ftp://..." URL, "http://..." URL, or otherwise a local path.
 *   mode - must start with 'r'; anything else is reported on stderr (the
 *          message is tagged "[kftp_open]").
 * FTP:   kftp_parse_url(), kftp_connect(), then kftp_connect_file().
 * HTTP:  khttp_parse_url(), then khttp_connect_file().
 * Local: open() with O_RDONLY (one of the two open() lines adds O_BINARY),
 *        then a zeroed knetFile of type KNF_TYPE_LOCAL is allocated.
 * Returns 0 when URL parsing fails.
 * NOTE(review): lines are missing from this listing, including the failure
 * paths after kftp_connect() and open(), the body of the final
 * fp->fd == -1 check and the return statement. The two open() calls look
 * like alternatives chosen by a condition that is not shown. */
408 knetFile *knet_open(const char *fn, const char *mode)
411 if (mode[0] != 'r') {
412 fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
415 if (strstr(fn, "ftp://") == fn) {
416 fp = kftp_parse_url(fn, mode);
417 if (fp == 0) return 0;
418 if (kftp_connect(fp) == -1) {
422 kftp_connect_file(fp);
423 } else if (strstr(fn, "http://") == fn) {
424 fp = khttp_parse_url(fn, mode);
425 if (fp == 0) return 0;
426 khttp_connect_file(fp);
427 } else { // local file
429 /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
430 * be undefined on some systems, although it is defined on my
431 * Mac and the Linux I have tested on. */
432 int fd = open(fn, O_RDONLY | O_BINARY);
434 int fd = open(fn, O_RDONLY);
440 fp = (knetFile*)calloc(1, sizeof(knetFile));
441 fp->type = KNF_TYPE_LOCAL;
445 if (fp && fp->fd == -1) {
/* Allocates a zeroed knetFile of type KNF_TYPE_LOCAL for the descriptor fd.
 * NOTE(review): the lines that store fd and return the handle are not
 * visible in this listing, mode is not referenced in the visible lines, and
 * the calloc() result is used without a NULL check. */
452 knetFile *knet_dopen(int fd, const char *mode)
454 knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
455 fp->type = KNF_TYPE_LOCAL;
/* Reads up to len bytes from fp into buf.
 * Returns 0 immediately when fp->fd is -1.
 * A remote stream that is not ready (is_ready == 0) is connected first:
 * FTP reconnects the control channel unless no_reconnect is set and then
 * calls kftp_connect_file(); HTTP calls khttp_connect_file().
 * Local files are read with repeated read() calls that stop when read()
 * returns 0; remote streams go through my_netread().
 * NOTE(review): the declaration of l, the local read loop header, any update
 * of fp->offset and the return statement are not visible in this listing.
 * NOTE(review): a negative read() result is not checked in the visible
 * lines. */
460 off_t knet_read(knetFile *fp, void *buf, off_t len)
463 if (fp->fd == -1) return 0;
464 if (fp->type == KNF_TYPE_FTP) {
465 if (fp->is_ready == 0) {
466 if (!fp->no_reconnect) kftp_reconnect(fp);
467 kftp_connect_file(fp);
469 } else if (fp->type == KNF_TYPE_HTTP) {
470 if (fp->is_ready == 0)
471 khttp_connect_file(fp);
473 if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
474 off_t rest = len, curr;
476 curr = read(fp->fd, buf + l, rest);
477 if (curr == 0) break;
478 l += curr; rest -= curr;
480 } else l = my_netread(fp->fd, buf, len);
/* Moves the read position of fp.
 * A SEEK_SET to the current fp->offset returns 0 without doing anything.
 * Local files are repositioned with lseek(). For FTP/HTTP only SEEK_SET is
 * accepted; any other whence is reported on stderr.
 * NOTE(review): the handling of the lseek() result, the remote SEEK_SET path
 * and the return statements are not visible in this listing. */
485 int knet_seek(knetFile *fp, off_t off, int whence)
487 if (whence == SEEK_SET && off == fp->offset) return 0;
488 if (fp->type == KNF_TYPE_LOCAL) {
489 /* Be aware that lseek() returns the offset after seeking,
490 * while fseek() returns zero on success. */
491 off_t offset = lseek(fp->fd, off, whence);
498 } else if (fp->type == KNF_TYPE_FTP || fp->type == KNF_TYPE_HTTP) {
499 if (whence != SEEK_SET) { // FIXME: we can surely allow SEEK_CUR and SEEK_END in future
500 fprintf(stderr, "[knet_seek] only SEEK_SET is supported for FTP/HTTP. Offset is unchanged.\n");
/* Closes fp and releases the strings it owns.
 * A null fp returns 0. The control socket is closed when ctrl_fd != -1;
 * fp->fd is closed with close() for local files and netclose() otherwise.
 * host, port, response, retr, path and http_host are freed.
 * NOTE(review): any guard around closing fp->fd, the free of fp itself and
 * the return statement are not visible in this listing. */
510 int knet_close(knetFile *fp)
512 if (fp == 0) return 0;
513 if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
515 /* On Linux/Mac, netclose() is an alias of close(), but on
516 * Windows, it is an alias of closesocket(). */
517 if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
518 else netclose(fp->fd);
520 free(fp->host); free(fp->port);
521 free(fp->response); free(fp->retr); // FTP specific
522 free(fp->path); free(fp->http_host); // HTTP specific
/* Fragment of a test driver; the enclosing function header and the first
 * branch condition are not visible in this listing.
 * Depending on `type` it opens a local file, one of two FTP URLs or one of
 * two HTTP URLs, seeks, reads into buf and writes the bytes to stdout.
 * NOTE(review): buf holds 0x100000 bytes, yet the type == 4 branch asks for
 * up to 10000000 bytes at buf+10000, which can exceed the allocation if that
 * much data arrives. */
536 buf = calloc(0x100000, 1);
538 fp = knet_open("knetfile.c", "r");
539 knet_seek(fp, 1000, SEEK_SET);
540 } else if (type == 1) { // NCBI FTP, large file
541 fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
// an offset beyond 2^31 exercises 64-bit seeking
542 knet_seek(fp, 2500000000ll, SEEK_SET);
543 l = knet_read(fp, buf, 255);
544 } else if (type == 2) {
545 fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
546 knet_seek(fp, 1000, SEEK_SET);
547 } else if (type == 3) {
548 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
549 knet_seek(fp, 1000, SEEK_SET);
550 } else if (type == 4) {
551 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
552 knet_read(fp, buf, 10000);
553 knet_seek(fp, 20000, SEEK_SET);
554 knet_seek(fp, 10000, SEEK_SET);
555 l = knet_read(fp, buf+10000, 10000000) + 10000;
557 if (type != 4 && type != 1) {
558 knet_read(fp, buf, 255);
561 } else write(fileno(stdout), buf, l);