3 Copyright (c) 2008 Genome Research Ltd (GRL).
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
28 /* Probably I will not do socket programming in the next few years and
29 therefore I decide to heavily annotate this file, for Linux and
30 Windows as well. -lh3 */
39 #include <sys/types.h>
43 #include <arpa/inet.h>
44 #include <sys/socket.h>
49 /* In winsock.h, the type of a socket is SOCKET, which is: "typedef
50 * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
51 * integer -1. In knetfile.c, I use "int" for socket type
52 * throughout. This should be improved to avoid confusion.
54 * In Linux/Mac, recv() and read() do almost the same thing. You can see
55 * in the header file that netread() is simply an alias of read(). In
56 * Windows, however, they are different and using recv() is mandatory.
59 /* This function tests if the file descriptor is ready for reading (or
60 * writing if is_read==0). */
/* Waits on fd with select() using a fixed 5-second timeout. When is_read
 * is non-zero the descriptor set is passed as the read set of select().
 * Failures and time-outs are reported on stderr by the visible code. */
61 static int socket_wait(int fd, int is_read)
63 fd_set fds, *fdr = 0, *fdw = 0;
66 tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
69 if (is_read) fdr = &fds;
71 ret = select(fd+1, fdr, fdw, 0, &tv);
73 if (ret == -1) perror("select");
76 fprintf(stderr, "select time-out\n");
/* Winsock-style failure: the error code comes from WSAGetLastError(). */
77 else if (ret == SOCKET_ERROR)
78 fprintf(stderr, "select: %d\n", WSAGetLastError());
84 /* This function does not work with Windows due to the lack of
85 * getaddrinfo() in winsock. It is adapted from an example in "Beej's
86 * Guide to Network Programming" (http://beej.us/guide/bgnet/). */
/* Resolves host and port with getaddrinfo() (any address family, stream
 * socket), creates a socket from the first result, sets SO_REUSEADDR and
 * SO_LINGER on it and connects. Each failing step goes through
 * __err_connect(), which prints via perror() and returns -1. */
87 static int socket_connect(const char *host, const char *port)
/* NOTE(review): this macro calls freeaddrinfo(res) on every failure,
 * including the getaddrinfo() failure itself, where res (declared below
 * without an initialiser) has not been set. It also does not close fd on
 * the setsockopt() and connect() failure paths. */
89 #define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)
/* Zero on/off and zero timeout for the SO_LINGER option set below. */
92 struct linger lng = { 0, 0 };
93 struct addrinfo hints, *res;
94 memset(&hints, 0, sizeof(struct addrinfo));
95 hints.ai_family = AF_UNSPEC;
96 hints.ai_socktype = SOCK_STREAM;
97 /* In Unix/Mac, getaddrinfo() is the most convenient way to get
98 * server information. */
99 if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo");
100 if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
101 /* The following two setsockopt() are used by ftplib
102 * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they
104 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
105 if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
106 if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
111 /* MinGW's printf has problems with "%lld" */
/* Formats x as decimal text in buf. Digits are emitted least-significant
 * first (x % 10) and the filled part of buf is then reversed in place.
 * NOTE(review): the capacity of buf is not checked in the visible lines;
 * the caller must supply enough room. */
112 char *int64tostr(char *buf, int64_t x)
117 buf[i++] = '0' + x % 10;
/* Reverse buf[0..cnt-1] by swapping mirrored positions. */
121 for (cnt = i, i = 0; i < cnt/2; ++i) {
122 int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c;
/* Converts a string of decimal digits to int64_t. Every character before
 * the terminating NUL is taken as a digit (value = character - 48, the
 * code of the digit zero). The visible loop has no handling for signs,
 * whitespace, non-digit characters or overflow. */
127 int64_t strtoint64(const char *buf)
130 for (x = 0; *buf != '\0'; ++buf)
131 x = x * 10 + ((int64_t) *buf - 48);
134 /* In windows, the first thing is to establish the TCP connection. */
/* Initialises Winsock, requesting version 2.2, and returns the result of
 * WSAStartup() unchanged. */
135 int knet_win32_init()
138 return WSAStartup(MAKEWORD(2, 2), &wsaData);
/* Counterpart of knet_win32_init().
 * NOTE(review): the body is not visible here; presumably it shuts Winsock
 * down again - confirm against the full file. */
140 void knet_win32_destroy()
144 /* A slightly modified version of the following function also works on
145 * Mac (and presumably Linux). However, this function is not stable on
146 * my Mac. It sometimes works fine but sometimes does not. Therefore for
147 * non-Windows OS, I do not use this one. */
/* Winsock variant of socket_connect(). Creates an IPv4 TCP socket, sets
 * SO_REUSEADDR and SO_LINGER, resolves host, and connects to the numeric
 * port obtained with atoi(port). Failures are reported through
 * __err_connect() together with the WSAGetLastError() code.
 * NOTE(review): atoi() gives no indication when port is not numeric. */
148 static SOCKET socket_connect(const char *host, const char *port)
150 #define __err_connect(func) \
152 fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \
158 struct linger lng = { 0, 0 };
159 struct sockaddr_in server;
160 struct hostent *hp = 0;
162 if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
163 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
164 if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
/* A host starting with a letter is looked up by name; the other visible
 * path converts it with inet_addr() and looks it up by address.
 * NOTE(review): isalpha() receives a plain char here; a cast to unsigned
 * char would be the safe form. */
166 if (isalpha(host[0])) hp = gethostbyname(host);
169 addr.s_addr = inet_addr(host);
170 hp = gethostbyaddr((char*)&addr, 4, AF_INET);
172 if (hp == 0) __err_connect("gethost");
/* Copy the first resolved address into the IPv4 socket address. */
174 server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
175 server.sin_family= AF_INET;
176 server.sin_port = htons(atoi(port));
177 if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
178 // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
/* Reads up to len bytes from fd into buf, accumulating the data across
 * repeated netread() calls. The loop stops early when socket_wait() does
 * not report the socket as readable or when netread() returns 0 (EOF).
 * NOTE(review): the return statement is not visible here; presumably the
 * byte count l is returned - confirm. */
183 static off_t my_netread(int fd, void *buf, off_t len)
/* rest = bytes still wanted, curr = result of the last call, l = bytes
 * stored so far. */
185 off_t rest = len, curr, l = 0;
186 /* recv() and read() may not read the required length of data with
187 * one call. They have to be called repeatedly. */
189 if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
/* NOTE(review): buf + l is arithmetic on a void pointer, which is a
 * compiler extension rather than standard C. */
190 curr = netread(fd, buf + l, rest);
191 /* According to the glibc manual, section 13.2, a zero returned
192 * value indicates end-of-file (EOF), which should mean that
193 * read() will not return zero if EOF has not been met but data
194 * are not immediately available. */
195 if (curr == 0) break;
/* NOTE(review): a negative curr (read error) is not filtered out in the
 * visible lines before it is added to l and subtracted from rest. */
196 l += curr; rest -= curr;
201 /*************************
202 * FTP specific routines *
203 *************************/
/* Reads a reply from the FTP control connection one byte at a time into
 * ftp->response and returns the leading number parsed by strtol().
 * Returns 0 when the control socket does not become readable within the
 * socket_wait() timeout and -1 when fewer than 2 bytes were collected. */
205 static int kftp_get_response(knetFile *ftp)
214 if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
215 while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
/* Grow the response buffer by doubling, starting at 256 bytes.
 * NOTE(review): the realloc() result overwrites the old pointer and is
 * used without a NULL check. */
217 if (n >= ftp->max_response) {
218 ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256;
219 ftp->response = realloc(ftp->response, ftp->max_response);
221 ftp->response[n++] = c;
/* Stop once the buffer starts with three digits and the fourth character
 * is not a dash. NOTE(review): any enclosing end-of-line test and buffer
 * reset are not visible here. */
223 if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2])
224 && ftp->response[3] != '-') break;
229 if (n < 2) return -1;
/* Terminate the string two characters before the end; presumably this
 * drops the trailing CR LF pair - confirm. */
230 ftp->response[n-2] = 0;
/* NOTE(review): base 0 lets strtol() treat a leading zero as octal. */
231 return strtol(ftp->response, &p, 0);
/* Sends cmd on the FTP control connection once the socket is writable.
 * Returns -1 if socket_wait() does not report it writable; otherwise the
 * result of kftp_get_response() when is_get is non-zero, or 0.
 * NOTE(review): the return value of netwrite() is ignored, so a short or
 * failed write goes unnoticed. */
234 static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
236 if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
237 netwrite(ftp->ctrl_fd, cmd, strlen(cmd));
238 return is_get? kftp_get_response(ftp) : 0;
/* Sends PASV and parses six comma-separated integers from the reply text
 * after the opening parenthesis. The first four are copied into
 * ftp->pasv_ip and the last two are combined into ftp->pasv_port.
 * Returns -1 when the reply contains no opening parenthesis. */
241 static int kftp_pasv_prep(knetFile *ftp)
/* NOTE(review): the reply code returned by kftp_send_cmd() is not
 * inspected here. */
245 kftp_send_cmd(ftp, "PASV\r\n", 1);
246 for (p = ftp->response; *p && *p != '('; ++p);
247 if (*p != '(') return -1;
/* NOTE(review): the number of fields converted by sscanf() is not
 * checked in the visible lines. */
249 sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
250 memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
/* Port = high byte v[4] shifted left by 8 plus low byte v[5]. */
251 ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
/* Opens the FTP data connection to the address stored in ftp->pasv_ip and
 * ftp->pasv_port, formatting them as a dotted quad and a decimal port for
 * socket_connect(). A zero pasv_port is reported on stderr. Returns -1
 * when socket_connect() yields -1. */
256 static int kftp_pasv_connect(knetFile *ftp)
258 char host[80], port[10];
259 if (ftp->pasv_port == 0) {
260 fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
263 sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
264 sprintf(port, "%d", ftp->pasv_port);
265 ftp->fd = socket_connect(host, port);
266 if (ftp->fd == -1) return -1;
/* Opens the FTP control connection to ftp->host and ftp->port, reads the
 * first server reply, then sends USER anonymous, PASS kftp@ and TYPE I.
 * Returns -1 when socket_connect() fails.
 * NOTE(review): the replies to the three commands are not inspected in
 * the visible lines, so a rejected login is not detected here. */
270 int kftp_connect(knetFile *ftp)
272 ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
273 if (ftp->ctrl_fd == -1) return -1;
274 kftp_get_response(ftp);
275 kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
276 kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
277 kftp_send_cmd(ftp, "TYPE I\r\n", 1);
/* Closes the current control connection, if one is open, and establishes
 * a new one through kftp_connect(), whose result is returned. */
281 int kftp_reconnect(knetFile *ftp)
283 if (ftp->ctrl_fd != -1) {
284 netclose(ftp->ctrl_fd);
289 return kftp_connect(ftp);
292 // initialize ->type, ->host, ->retr and ->size
/* Builds a knetFile for an FTP URL. Returns 0 unless fn begins with the
 * ftp scheme prefix and contains a slash after the host part. The port is
 * fixed to 21 and the remainder of the URL, starting at that slash, is
 * embedded in ready-to-send RETR and SIZE commands. A letter c in mode
 * sets no_reconnect.
 * NOTE(review): none of the calloc() and strdup() results are checked in
 * the visible lines. */
293 knetFile *kftp_parse_url(const char *fn, const char *mode)
298 if (strstr(fn, "ftp://") != fn) return 0;
/* Skip the 6-character scheme prefix and stop at the first slash. */
299 for (p = (char*)fn + 6; *p && *p != '/'; ++p);
300 if (*p != '/') return 0;
302 fp = calloc(1, sizeof(knetFile));
303 fp->type = KNF_TYPE_FTP;
305 /* the Linux/Mac version of socket_connect() also recognizes a port
306 * like "ftp", but the Windows version does not. */
307 fp->port = strdup("21");
/* l is assigned on a line not visible here; it is used as the length of
 * the host part that follows the scheme prefix. */
308 fp->host = calloc(l + 1, 1);
309 if (strchr(mode, 'c')) fp->no_reconnect = 1;
310 strncpy(fp->host, fn + 6, l);
/* 8 extra bytes = 5 for the command word and space, 2 for CR LF, 1 for
 * the terminating NUL. */
311 fp->retr = calloc(strlen(p) + 8, 1);
312 sprintf(fp->retr, "RETR %s\r\n", p);
313 fp->size_cmd = calloc(strlen(p) + 8, 1);
314 sprintf(fp->size_cmd, "SIZE %s\r\n", p);
318 // place ->fd at offset off
/* Prepares the FTP data stream at fp->offset. It sends the SIZE command
 * and stores the parsed size in fp->file_size, sends REST with the
 * offset, sends RETR without waiting for a reply, opens the data
 * connection and then reads the pending reply.
 * NOTE(review): two alternative code paths are visible for parsing the
 * size and for formatting REST (sscanf/sprintf with %lld versus
 * strtoint64()/int64tostr()); presumably they are selected by
 * preprocessor conditionals that are not visible here. */
319 int kftp_connect_file(knetFile *fp)
325 if (fp->no_reconnect) kftp_get_response(fp);
328 kftp_send_cmd(fp, fp->size_cmd, 1);
/* Variant 1: skip the reply code and read the size with %lld. */
330 if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 )
332 fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
/* Variant 2: step over the reply code to the first digit after the
 * space and convert with strtoint64().
 * NOTE(review): neither scan loop stops at the terminating NUL. */
336 const char *p = fp->response;
337 while (*p != ' ') ++p;
338 while (*p < '0' || *p > '9') ++p;
339 file_size = strtoint64(p);
341 fp->file_size = file_size;
345 sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
347 strcpy(tmp, "REST ");
348 int64tostr(tmp + 5, fp->offset);
351 kftp_send_cmd(fp, tmp, 1);
/* RETR is sent with is_get == 0; its reply is read only after the data
 * connection has been opened. */
353 kftp_send_cmd(fp, fp->retr, 0);
354 kftp_pasv_connect(fp);
355 ret = kftp_get_response(fp);
357 fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
367 /**************************
368 * HTTP specific routines *
369 **************************/
/* Builds a knetFile for an HTTP URL. Returns 0 unless fn begins with the
 * http scheme prefix. http_host receives the text between the prefix and
 * the first slash, with any colon-separated port split off. Two setups
 * are visible: a direct one (host = http_host, port as given or 80, path
 * = URL path or a single slash) and a proxy one driven by the http_proxy
 * environment variable (host and port taken from the proxy string, path =
 * the full URL).
 * NOTE(review): the condition choosing between the two setups is not
 * visible here; allocation results are not checked in the visible lines. */
371 knetFile *khttp_parse_url(const char *fn, const char *mode)
376 if (strstr(fn, "http://") != fn) return 0;
/* Skip the 7-character scheme prefix and stop at the first slash. */
378 for (p = (char*)fn + 7; *p && *p != '/'; ++p);
380 fp = calloc(1, sizeof(knetFile));
381 fp->http_host = calloc(l + 1, 1);
382 strncpy(fp->http_host, fn + 7, l);
383 fp->http_host[l] = 0;
/* Cut http_host at the colon; q then points at the port text, or at the
 * terminating NUL when no port was given. */
384 for (q = fp->http_host; *q && *q != ':'; ++q);
385 if (*q == ':') *q++ = 0;
387 proxy = getenv("http_proxy");
388 // set ->host, ->port and ->path
390 fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
391 fp->port = strdup(*q? q : "80");
392 fp->path = strdup(*p? p : "/");
/* Proxy setup: an optional scheme prefix on the proxy string is dropped
 * before the host and port are split at the colon. */
394 fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
395 for (q = fp->host; *q && *q != ':'; ++q);
396 if (*q == ':') *q++ = 0;
397 fp->port = strdup(*q? q : "80");
398 fp->path = strdup(fn);
400 fp->type = KNF_TYPE_HTTP;
401 fp->ctrl_fd = fp->fd = -1;
/* Opens (or reopens) the HTTP connection and sends a GET request with a
 * Range header starting at fp->offset. The response header is read byte
 * by byte until the blank line that ends it, and the status code is
 * parsed from position 8 of the header. When the server answers 200 to a
 * non-zero offset, the leading bytes are read and discarded; any status
 * other than 200 or 206 is reported on stderr.
 * NOTE(review): the result of socket_connect() is not checked before
 * netwrite() in the visible lines, and the request is formatted into a
 * fixed 64 KB buffer without bounds checks. */
406 int khttp_connect_file(knetFile *fp)
410 if (fp->fd != -1) netclose(fp->fd);
411 fp->fd = socket_connect(fp->host, fp->port);
412 buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
413 l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
414 l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
415 l += sprintf(buf + l, "\r\n");
416 netwrite(fp->fd, buf, l);
418 while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
/* The header ends when the last four bytes are CR LF CR LF. */
419 if (buf[l] == '\n' && l >= 3)
420 if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
424 if (l < 14) { // premature header
429 ret = strtol(buf + 8, &p, 0); // HTTP return code
430 if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
431 off_t rest = fp->offset;
/* Discard in chunks of at most 64 KB, the size of buf. */
433 off_t l = rest < 0x10000? rest : 0x10000;
434 rest -= my_netread(fp->fd, buf, l);
436 } else if (ret != 206 && ret != 200) {
438 fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
448 /********************
450 ********************/
/* Opens fn for reading and returns a knetFile handle. Only modes whose
 * first character is r are accepted. The target is chosen by URL prefix:
 * FTP (parse, connect, open the file stream), HTTP (parse, open the file
 * stream), otherwise a local file opened with open().
 * NOTE(review): the error message below is tagged kftp_open although this
 * function is knet_open. The return values of kftp_connect_file() and
 * khttp_connect_file() are not inspected in the visible lines. */
452 knetFile *knet_open(const char *fn, const char *mode)
455 if (mode[0] != 'r') {
456 fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
459 if (strstr(fn, "ftp://") == fn) {
460 fp = kftp_parse_url(fn, mode);
461 if (fp == 0) return 0;
462 if (kftp_connect(fp) == -1) {
466 kftp_connect_file(fp);
467 } else if (strstr(fn, "http://") == fn) {
468 fp = khttp_parse_url(fn, mode);
469 if (fp == 0) return 0;
470 khttp_connect_file(fp);
471 } else { // local file
473 /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
474 * be undefined on some systems, although it is defined on my
475 * Mac and the Linux I have tested on. */
/* Two alternative open() calls follow; presumably a preprocessor
 * conditional not visible here selects one of them. */
476 int fd = open(fn, O_RDONLY | O_BINARY);
478 int fd = open(fn, O_RDONLY);
484 fp = (knetFile*)calloc(1, sizeof(knetFile));
485 fp->type = KNF_TYPE_LOCAL;
/* Handle exists but has no usable descriptor; the handling itself is not
 * visible here. */
489 if (fp && fp->fd == -1) {
/* Allocates a zeroed knetFile of type KNF_TYPE_LOCAL for an already open
 * descriptor.
 * NOTE(review): the lines that store fd and return the handle are not
 * visible here; the calloc() result is dereferenced without a NULL check
 * and mode is not used in the visible lines. */
496 knetFile *knet_dopen(int fd, const char *mode)
498 knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
499 fp->type = KNF_TYPE_LOCAL;
/* Reads up to len bytes from fp into buf. Returns 0 at once when fp has
 * no open descriptor. For FTP and HTTP handles the data stream is set up
 * lazily when is_ready is 0 (FTP also reconnects the control channel
 * unless no_reconnect is set). Local files are read with repeated read()
 * calls until EOF; remote ones go through my_netread(). */
504 off_t knet_read(knetFile *fp, void *buf, off_t len)
507 if (fp->fd == -1) return 0;
508 if (fp->type == KNF_TYPE_FTP) {
509 if (fp->is_ready == 0) {
510 if (!fp->no_reconnect) kftp_reconnect(fp);
511 kftp_connect_file(fp);
513 } else if (fp->type == KNF_TYPE_HTTP) {
514 if (fp->is_ready == 0)
515 khttp_connect_file(fp);
517 if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
518 off_t rest = len, curr;
/* NOTE(review): buf + l is arithmetic on a void pointer (a compiler
 * extension), and only curr == 0 ends the loop in the visible lines; a
 * read() error of -1 is not singled out. */
520 curr = read(fp->fd, buf + l, rest);
521 if (curr == 0) break;
522 l += curr; rest -= curr;
524 } else l = my_netread(fp->fd, buf, len);
/* Repositions fp. A SEEK_SET request to the current offset returns 0
 * without doing anything. Local files are moved with lseek(). For FTP
 * the visible code updates fp->offset (SEEK_END uses file_size + off).
 * For HTTP, SEEK_END is rejected with a message on stderr. The final
 * visible statement prints strerror(errno).
 * NOTE(review): the SEEK_CUR and SEEK_SET bodies and all return
 * statements after the first are not visible here. */
529 off_t knet_seek(knetFile *fp, int64_t off, int whence)
531 if (whence == SEEK_SET && off == fp->offset) return 0;
532 if (fp->type == KNF_TYPE_LOCAL) {
533 /* Be aware that lseek() returns the offset after seeking,
534 * while fseek() returns zero on success. */
535 off_t offset = lseek(fp->fd, off, whence);
537 // Be silent, it is OK for knet_seek to fail when the file is streamed
538 // fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
544 else if (fp->type == KNF_TYPE_FTP)
546 if (whence==SEEK_CUR)
548 else if (whence==SEEK_SET)
550 else if ( whence==SEEK_END)
/* Relies on file_size, which kftp_connect_file() stores. */
551 fp->offset = fp->file_size+off;
555 else if (fp->type == KNF_TYPE_HTTP)
557 if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
558 fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
562 if (whence==SEEK_CUR)
564 else if (whence==SEEK_SET)
570 fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
/* Releases fp. A null handle is accepted and returns 0. Closes the FTP
 * control connection when one is open, closes the data descriptor
 * (close() for local files, netclose() otherwise) and frees the strings
 * owned by the handle.
 * NOTE(review): fp->size_cmd, allocated in kftp_parse_url(), is not freed
 * in the visible lines. */
574 int knet_close(knetFile *fp)
576 if (fp == 0) return 0;
577 if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
579 /* On Linux/Mac, netclose() is an alias of close(), but on
580 * Windows, it is an alias of closesocket(). */
581 if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
582 else netclose(fp->fd);
584 free(fp->host); free(fp->port);
585 free(fp->response); free(fp->retr); // FTP specific
586 free(fp->path); free(fp->http_host); // HTTP specific
/* NOTE(review): the header of the enclosing function is not visible here;
 * this looks like a manual test driver that picks a scenario by the value
 * of type (local file, FTP, HTTP), seeks, reads, and writes the data to
 * standard output. */
/* Scratch buffer of 0x100000 bytes (1 MiB), zero-filled. */
600 buf = calloc(0x100000, 1);
602 fp = knet_open("knetfile.c", "r");
603 knet_seek(fp, 1000, SEEK_SET);
604 } else if (type == 1) { // NCBI FTP, large file
605 fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
606 knet_seek(fp, 2500000000ll, SEEK_SET);
607 l = knet_read(fp, buf, 255);
608 } else if (type == 2) {
609 fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
610 knet_seek(fp, 1000, SEEK_SET);
611 } else if (type == 3) {
612 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
613 knet_seek(fp, 1000, SEEK_SET);
614 } else if (type == 4) {
615 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
616 knet_read(fp, buf, 10000);
617 knet_seek(fp, 20000, SEEK_SET);
618 knet_seek(fp, 10000, SEEK_SET);
/* NOTE(review): this asks for up to 10000000 bytes at buf + 10000, which
 * is far more than the 0x100000 bytes allocated for buf above. */
619 l = knet_read(fp, buf+10000, 10000000) + 10000;
621 if (type != 4 && type != 1) {
622 knet_read(fp, buf, 255);
625 } else write(fileno(stdout), buf, l);