3 Copyright (c) 2008 Genome Research Ltd (GRL).
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
28 /* Probably I will not do socket programming in the next few years and
29 therefore I decide to heavily annotate this file, for Linux and
30 Windows as well. -lh3 */
39 #include <sys/types.h>
45 #include <arpa/inet.h>
46 #include <sys/socket.h>
51 /* In winsock.h, the type of a socket is SOCKET, which is: "typedef
52 * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
53 * integer -1. In knetfile.c, I use "int" for socket type
54 * throughout. This should be improved to avoid confusion.
56 * In Linux/Mac, recv() and read() do almost the same thing. You can see
57 * in the header file that netread() is simply an alias of read(). In
58 * Windows, however, they are different and using recv() is mandatory.
61 /* This function tests if the file handler is ready for reading (or
62 * writing if is_read==0). */
/* Waits in select() for at most 5 seconds for `fd` to become ready.
 * A non-zero is_read places the descriptor set in the read slot of
 * select(). The select() result is stored in `ret`; a result of -1 is
 * reported through perror(). Callers in this file treat a result <= 0 as
 * "not ready". */
63 static int socket_wait(int fd, int is_read)
65 fd_set fds, *fdr = 0, *fdw = 0;
68 tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
/* Only the set pointer that is non-NULL is examined by select(). */
71 if (is_read) fdr = &fds;
/* The first argument is the highest descriptor of interest plus one. */
73 ret = select(fd+1, fdr, fdw, 0, &tv);
74 if (ret == -1) perror("select");
79 /* This function does not work with Windows due to the lack of
80 * getaddrinfo() in winsock. It is adapted from an example in "Beej's
81 * Guide to Network Programming" (http://beej.us/guide/bgnet/). */
/* Resolves host/port with getaddrinfo() (any address family, stream
 * sockets), creates a socket from the first returned entry, applies
 * SO_REUSEADDR and SO_LINGER, and connects. Every failing step leaves
 * through __err_connect, which returns -1. */
82 static int socket_connect(const char *host, const char *port)
/* Shared error exit: print the errno text, free the address list, return -1.
 * NOTE(review): the macro does not close fd, so a failure after socket()
 * succeeded appears to leak the descriptor. */
84 #define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)
/* Zero-initialised linger settings, passed to setsockopt(SO_LINGER) below. */
87 struct linger lng = { 0, 0 };
/* NOTE(review): res has no initialiser here, so when getaddrinfo() fails
 * the macro hands an indeterminate pointer to freeaddrinfo(). getaddrinfo()
 * also reports errors through its return value rather than errno, so the
 * perror() text may be unrelated; gai_strerror() is the matching call. */
88 struct addrinfo hints, *res;
89 memset(&hints, 0, sizeof(struct addrinfo));
90 hints.ai_family = AF_UNSPEC;
91 hints.ai_socktype = SOCK_STREAM;
92 /* In Unix/Mac, getaddrinfo() is the most convenient way to get
93 * server information. */
94 if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo");
/* Only the first entry of the result list is used. */
95 if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
96 /* The following two setsockopt() are used by ftplib
97 * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they
99 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
100 if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
101 if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
106 /* MinGW's printf has problem with "%lld" */
/* Writes the decimal representation of `x` into `buf` and returns `buf`.
 *
 * buf must provide at least 21 bytes: up to 20 digits for the largest
 * 64-bit value plus the terminating NUL. Zero is rendered as a single
 * '0' digit instead of an empty string, so a command built from the
 * result (for example an FTP REST offset) always carries a number. */
char *uint64tostr(char *buf, uint64_t x)
{
	int cnt, i = 0;
	/* Emit digits least-significant first; do/while guarantees that at
	 * least one digit is produced, which covers x == 0. */
	do {
		buf[i++] = (char)('0' + x % 10);
		x /= 10;
	} while (x);
	buf[i] = 0;
	/* Reverse in place so the most-significant digit comes first. */
	for (cnt = i, i = 0; i < cnt/2; ++i) {
		char c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c;
	}
	return buf;
}
117 /* In windows, the first thing is to establish the TCP connection. */
/* Initialises Winsock by requesting version 2.2; the status returned by
 * WSAStartup() is passed back to the caller unchanged. */
118 int knet_win32_init()
121 return WSAStartup(MAKEWORD(2, 2), &wsaData);
/* NOTE(review): presumably the teardown counterpart of knet_win32_init();
 * confirm against the function body. */
123 void knet_win32_destroy()
127 /* A slightly modified version of the following function also works on
128 * Mac (and presumably Linux). However, this function is not stable on
129 * my Mac. It sometimes works fine but sometimes does not. Therefore for
130 * non-Windows OS, I do not use this one. */
/* Winsock variant of socket_connect(): creates an IPv4 TCP socket, sets
 * SO_REUSEADDR and SO_LINGER, resolves `host` to a hostent and connects to
 * the numeric `port`. Failing steps leave through __err_connect, which
 * returns -1. */
131 static SOCKET socket_connect(const char *host, const char *port)
/* NOTE(review): this error exit only prints and returns; a socket that was
 * already created is not closed on the later failure paths. */
133 #define __err_connect(func) do { perror(func); return -1; } while (0)
137 struct linger lng = { 0, 0 };
138 struct sockaddr_in server;
139 struct hostent *hp = 0;
141 if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
142 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
143 if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
/* A host starting with a letter is looked up by name; the inet_addr() and
 * gethostbyaddr() calls below look the host up as a dotted IPv4 address.
 * NOTE(review): presumably those form the alternative branch, in which case
 * a host name that begins with a digit would be treated as an address. */
145 if (isalpha(host[0])) hp = gethostbyname(host);
148 addr.s_addr = inet_addr(host);
149 hp = gethostbyaddr((char*)&addr, 4, AF_INET);
151 if (hp == 0) __err_connect("gethost");
/* Copy the first resolved address and the numeric port into the IPv4
 * socket address used by connect(). */
153 server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
154 server.sin_family= AF_INET;
155 server.sin_port = htons(atoi(port));
156 if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
157 // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
/* Reads up to `len` bytes from socket `fd` into `buf`, calling netread()
 * repeatedly because a single call may deliver less than requested. The
 * loop is left early when socket_wait() does not report the socket as
 * readable (result <= 0) or when netread() returns 0. */
162 static off_t my_netread(int fd, void *buf, off_t len)
/* rest: bytes still wanted; curr: result of the latest netread();
 * l: bytes stored so far. */
164 off_t rest = len, curr, l = 0;
165 /* recv() and read() may not read the required length of data with
166 * one call. They have to be called repeatedly. */
168 if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
/* NOTE(review): buf is a void pointer, so buf + l relies on the GNU
 * void-pointer arithmetic extension. */
169 curr = netread(fd, buf + l, rest);
170 /* According to the glibc manual, section 13.2, a zero returned
171 * value indicates end-of-file (EOF), which should mean that
172 * read() will not return zero if EOF has not been met but data
173 * are not immediately available. */
/* NOTE(review): only 0 is tested here; a negative result from netread()
 * would be added to l and subtracted from rest on the next line. */
174 if (curr == 0) break;
175 l += curr; rest -= curr;
180 /*************************
181 * FTP specific routines *
182 *************************/
/* Reads the server reply from the FTP control connection one byte at a
 * time into ftp->response and returns the value strtol() parses from its
 * start. Returns 0 when the control socket is not readable and -1 when
 * fewer than two bytes were collected. */
184 static int kftp_get_response(knetFile *ftp)
189 if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
190 while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
/* Grow the reply buffer geometrically: 256 bytes at first, then doubling.
 * NOTE(review): the realloc() result overwrites ftp->response directly and
 * is not checked, so an allocation failure loses the old buffer and the
 * store below dereferences NULL. */
192 if (n >= ftp->max_response) {
193 ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256;
194 ftp->response = realloc(ftp->response, ftp->max_response);
196 ftp->response[n++] = c;
/* Stop once the buffer starts with three digits followed by a character
 * other than '-'. NOTE(review): presumably '-' in that position marks a
 * continued multi-line reply; confirm against the FTP specification. */
198 if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2])
199 && ftp->response[3] != '-') break;
204 if (n < 2) return -1;
/* Terminate the string two characters before the end of the data read
 * (presumably dropping the trailing CR LF; verify). */
205 ftp->response[n-2] = 0;
/* NOTE(review): base 0 lets strtol() treat a leading 0 as an octal prefix. */
206 return strtol(ftp->response, &p, 0);
/* Sends `cmd` on the FTP control connection after socket_wait() reports it
 * writable. Returns -1 when the socket is not ready. Otherwise returns the
 * result of kftp_get_response() when is_get is non-zero, or 0 when is_get
 * is zero. */
209 static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
211 if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
/* NOTE(review): the result of netwrite() is ignored, so a short or failed
 * write is not detected here. */
212 netwrite(ftp->ctrl_fd, cmd, strlen(cmd));
213 return is_get? kftp_get_response(ftp) : 0;
/* Issues PASV on the control connection and extracts the data-connection
 * endpoint from the reply: six comma-separated integers are parsed, the
 * first four are copied to ftp->pasv_ip and the last two are combined into
 * ftp->pasv_port. Returns -1 when the reply contains no opening
 * parenthesis. */
216 static int kftp_pasv_prep(knetFile *ftp)
220 kftp_send_cmd(ftp, "PASV\r\n", 1);
/* Scan the reply for the opening parenthesis. */
221 for (p = ftp->response; *p && *p != '('; ++p);
222 if (*p != '(') return -1;
/* NOTE(review): the sscanf() result is not checked, so a malformed reply
 * leaves some entries of v unset before they are used below. */
224 sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
/* Copies four ints; assumes pasv_ip holds at least four ints - confirm
 * against the knetFile declaration. */
225 memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
/* Port = fifth value as the high byte, sixth value added as the low part. */
226 ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
/* Opens the FTP data connection to the endpoint stored in ftp->pasv_ip and
 * ftp->pasv_port and keeps the descriptor in ftp->fd. A zero pasv_port is
 * reported on stderr as a missing kftp_pasv_prep() call. Returns -1 when
 * socket_connect() fails. */
231 static int kftp_pasv_connect(knetFile *ftp)
233 char host[80], port[10];
234 if (ftp->pasv_port == 0) {
235 fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
/* socket_connect() takes strings, so format the stored numbers as a
 * dotted address and a decimal port. */
238 sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
239 sprintf(port, "%d", ftp->pasv_port);
240 ftp->fd = socket_connect(host, port);
241 if (ftp->fd == -1) return -1;
/* Opens the FTP control connection to ftp->host:ftp->port, consumes the
 * initial server reply, logs in as the anonymous user and switches the
 * transfer type with TYPE I. Returns -1 when the control socket cannot be
 * opened. */
245 int kftp_connect(knetFile *ftp)
247 ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
248 if (ftp->ctrl_fd == -1) return -1;
/* NOTE(review): the reply codes of the greeting and of the three commands
 * below are not inspected, so a rejected login is not detected here. */
249 kftp_get_response(ftp);
250 kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
251 kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
252 kftp_send_cmd(ftp, "TYPE I\r\n", 1);
/* Closes the FTP control connection if one is open and establishes a new
 * one through kftp_connect(), whose result is returned. */
256 int kftp_reconnect(knetFile *ftp)
258 if (ftp->ctrl_fd != -1) {
259 netclose(ftp->ctrl_fd);
263 return kftp_connect(ftp);
266 // initialize ->type, ->host, ->retr and ->size
/* Builds a knetFile for an FTP URL. `fn` must start with the ftp scheme
 * prefix and contain a slash after the host part; otherwise 0 is returned.
 * The host, the fixed port 21 and the RETR and SIZE command strings for the
 * remote path are stored in newly allocated members. A 'c' in `mode` sets
 * no_reconnect. */
267 knetFile *kftp_parse_url(const char *fn, const char *mode)
272 if (strstr(fn, "ftp://") != fn) return 0;
/* Skip the 6-character scheme prefix; p stops at the first slash after the
 * host, which is where the remote path begins. */
273 for (p = (char*)fn + 6; *p && *p != '/'; ++p);
274 if (*p != '/') return 0;
/* NOTE(review): the results of calloc() and strdup() in this function are
 * used without a NULL check. */
276 fp = calloc(1, sizeof(knetFile));
277 fp->type = KNF_TYPE_FTP;
279 /* the Linux/Mac version of socket_connect() also recognizes a port
280 * like "ftp", but the Windows version does not. */
281 fp->port = strdup("21");
/* Assumes l holds the length of the host part - confirm. The buffer is
 * zero-filled and one byte longer than l, so the strncpy() below leaves the
 * host NUL-terminated. */
282 fp->host = calloc(l + 1, 1);
283 if (strchr(mode, 'c')) fp->no_reconnect = 1;
284 strncpy(fp->host, fn + 6, l);
/* 8 extra bytes: 5 for the command word and space, 2 for CR LF, 1 for NUL. */
285 fp->retr = calloc(strlen(p) + 8, 1);
286 sprintf(fp->retr, "RETR %s\r\n", p);
287 fp->size_cmd = calloc(strlen(p) + 8, 1);
288 sprintf(fp->size_cmd, "SIZE %s\r\n", p);
292 // place ->fd at offset off
/* Prepares the FTP data stream for reading at fp->offset: queries the file
 * size with the stored SIZE command, sends REST with the current offset,
 * sends the stored RETR command, opens the passive data connection and
 * reads the server reply to RETR. */
293 int kftp_connect_file(knetFile *fp)
/* With no_reconnect set, a reply is read from the control connection
 * before any new command is sent. */
299 if (fp->no_reconnect) kftp_get_response(fp);
302 kftp_send_cmd(fp, fp->size_cmd, 1);
/* The reply is expected to be a status code followed by the size; the code
 * is skipped and only the size is stored. On a parse failure the raw reply
 * is printed to stderr. */
303 if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 )
305 fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
307 } else fp->file_size = file_size;
/* Two ways of building the REST command follow: sprintf() with %lld, and
 * strcpy() plus uint64tostr(). NOTE(review): presumably they are selected
 * by a platform conditional, since uint64tostr() exists for a printf
 * without %lld support - confirm. */
311 sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
313 strcpy(tmp, "REST ");
314 uint64tostr(tmp + 5, fp->offset);
317 kftp_send_cmd(fp, tmp, 1);
/* RETR is sent without waiting for its reply (is_get == 0); the reply is
 * read only after the data connection has been opened. */
319 kftp_send_cmd(fp, fp->retr, 0);
320 kftp_pasv_connect(fp);
321 ret = kftp_get_response(fp);
323 fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
333 /**************************
334 * HTTP specific routines *
335 **************************/
/* Builds a knetFile for an HTTP URL. `fn` must start with the http scheme
 * prefix, otherwise 0 is returned. http_host receives the host part of the
 * URL; host, port and path describe where to connect and what to request,
 * and are taken from the http_proxy environment variable when a proxy is
 * used. Both descriptors start out as -1. */
337 knetFile *khttp_parse_url(const char *fn, const char *mode)
342 if (strstr(fn, "http://") != fn) return 0;
/* Skip the 7-character scheme prefix; p stops at the first slash after the
 * host part or at the end of the string. */
344 for (p = (char*)fn + 7; *p && *p != '/'; ++p);
/* NOTE(review): the results of calloc() and strdup() in this function are
 * used without a NULL check. */
346 fp = calloc(1, sizeof(knetFile));
/* Assumes l holds the length of the host[:port] part - confirm. */
347 fp->http_host = calloc(l + 1, 1);
348 strncpy(fp->http_host, fn + 7, l);
349 fp->http_host[l] = 0;
/* Split host and port at the first colon; afterwards q points at the port
 * text, or at the terminating NUL when no port was given. */
350 for (q = fp->http_host; *q && *q != ':'; ++q);
351 if (*q == ':') *q++ = 0;
353 proxy = getenv("http_proxy");
354 // set ->host, ->port and ->path
/* Direct connection: connect to the URL host, default port 80, and request
 * the path part of the URL (a single slash when the URL has no path). */
356 fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
357 fp->port = strdup(*q? q : "80");
358 fp->path = strdup(*p? p : "/");
/* Proxy connection: connect to the proxy host, with an optional scheme
 * prefix removed, on its port or 80, and request the complete URL. */
360 fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
361 for (q = fp->host; *q && *q != ':'; ++q);
362 if (*q == ':') *q++ = 0;
363 fp->port = strdup(*q? q : "80");
364 fp->path = strdup(fn);
366 fp->type = KNF_TYPE_HTTP;
367 fp->ctrl_fd = fp->fd = -1;
/* Opens a fresh HTTP connection and issues a GET for fp->path with a Range
 * header starting at fp->offset, then reads the response header byte by
 * byte. A 206 reply is accepted as is. For a 200 reply with a positive
 * offset the leading bytes of the body are read and discarded. Other
 * status codes are reported on stderr. */
372 int khttp_connect_file(knetFile *fp)
376 if (fp->fd != -1) netclose(fp->fd);
/* NOTE(review): the result of socket_connect() is not checked before the
 * descriptor is used by netwrite() below. */
377 fp->fd = socket_connect(fp->host, fp->port);
/* One 64KB buffer holds the request, then the response header, and is then
 * reused as scratch space for skipped body bytes. */
378 buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
379 l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
380 l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
381 l += sprintf(buf + l, "\r\n");
382 netwrite(fp->fd, buf, l);
/* The header ends at the first empty line, i.e. when the last four bytes
 * read are CR LF CR LF. NOTE(review): no bound on l against the 64KB
 * buffer is visible in this loop - confirm. */
384 while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
385 if (buf[l] == '\n' && l >= 3)
386 if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
390 if (l < 14) { // prematured header
/* Parsing starts 8 bytes into the header; assumes an 8-character HTTP
 * version token precedes the status code - confirm. */
395 ret = strtol(buf + 8, &p, 0); // HTTP return code
396 if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
397 off_t rest = fp->offset;
/* Discard the unwanted prefix in chunks of at most 64KB.
 * NOTE(review): if my_netread() returns 0, rest does not shrink; check
 * that the enclosing loop cannot spin forever. */
399 off_t l = rest < 0x10000? rest : 0x10000;
400 rest -= my_netread(fp->fd, buf, l);
402 } else if (ret != 206 && ret != 200) {
404 fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
414 /********************
416 ********************/
/* Opens `fn` for reading and returns a knetFile handle. FTP and HTTP URLs
 * are recognised by their scheme prefix and go through the matching
 * parse_url and connect_file routines; any other name is opened as a local
 * file with open(). Only modes starting with 'r' are accepted. */
418 knetFile *knet_open(const char *fn, const char *mode)
421 if (mode[0] != 'r') {
422 fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
/* FTP: parse the URL, open the control connection, then start the
 * transfer. A parse failure returns 0. */
425 if (strstr(fn, "ftp://") == fn) {
426 fp = kftp_parse_url(fn, mode);
427 if (fp == 0) return 0;
428 if (kftp_connect(fp) == -1) {
432 kftp_connect_file(fp);
/* HTTP: parse the URL and send the request. A parse failure returns 0. */
433 } else if (strstr(fn, "http://") == fn) {
434 fp = khttp_parse_url(fn, mode);
435 if (fp == 0) return 0;
436 khttp_connect_file(fp);
437 } else { // local file
439 /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
440 * be undefined on some systems, although it is defined on my
441 * Mac and the Linux I have tested on. */
/* Two open() variants follow, with and without O_BINARY (presumably chosen
 * by a preprocessor conditional; confirm). */
442 int fd = open(fn, O_RDONLY | O_BINARY);
444 int fd = open(fn, O_RDONLY);
450 fp = (knetFile*)calloc(1, sizeof(knetFile));
451 fp->type = KNF_TYPE_LOCAL;
/* A handle whose descriptor is still -1 gets separate handling here. */
455 if (fp && fp->fd == -1) {
/* Creates a zero-initialised knetFile of type KNF_TYPE_LOCAL for the
 * descriptor `fd`.
 * NOTE(review): the calloc() result is dereferenced without a NULL check. */
462 knetFile *knet_dopen(int fd, const char *mode)
464 knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
465 fp->type = KNF_TYPE_LOCAL;
/* Reads up to `len` bytes from `fp` into `buf`. A handle whose descriptor
 * is -1 yields 0. FTP and HTTP handles that are not ready are connected
 * (again) first. Local files are read with read(), remote ones with
 * my_netread(). */
470 off_t knet_read(knetFile *fp, void *buf, off_t len)
473 if (fp->fd == -1) return 0;
/* Remote transfers are started lazily: is_ready == 0 means the data stream
 * has to be opened at the current offset before reading. */
474 if (fp->type == KNF_TYPE_FTP) {
475 if (fp->is_ready == 0) {
476 if (!fp->no_reconnect) kftp_reconnect(fp);
477 kftp_connect_file(fp);
479 } else if (fp->type == KNF_TYPE_HTTP) {
480 if (fp->is_ready == 0)
481 khttp_connect_file(fp);
483 if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
484 off_t rest = len, curr;
/* NOTE(review): only a 0 result from read() is tested in the lines shown;
 * a negative result would be added to l and subtracted from rest. */
486 curr = read(fp->fd, buf + l, rest);
487 if (curr == 0) break;
488 l += curr; rest -= curr;
490 } else l = my_netread(fp->fd, buf, len);
/* Repositions `fp`. A SEEK_SET to the current offset returns 0 at once.
 * Local files are moved with lseek(). FTP and HTTP handles have a separate
 * branch for each whence value; for HTTP, SEEK_END is rejected with a
 * message on stderr. */
495 off_t knet_seek(knetFile *fp, off_t off, int whence)
497 if (whence == SEEK_SET && off == fp->offset) return 0;
498 if (fp->type == KNF_TYPE_LOCAL) {
499 /* Be aware that lseek() returns the offset after seeking,
500 * while fseek() returns zero on success. */
501 off_t offset = lseek(fp->fd, off, whence);
/* FTP: SEEK_END is resolved against the file size stored in
 * fp->file_size. */
508 else if (fp->type == KNF_TYPE_FTP)
510 if (whence==SEEK_CUR)
512 else if (whence==SEEK_SET)
514 else if ( whence==SEEK_END)
515 fp->offset = fp->file_size+off;
/* HTTP: seeking relative to the end is not supported. */
519 else if (fp->type == KNF_TYPE_HTTP)
521 if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
522 fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
526 if (whence==SEEK_CUR)
528 else if (whence==SEEK_SET)
/* Closes the descriptors held by `fp` and frees its string members.
 * A NULL handle is accepted and returns 0. */
537 int knet_close(knetFile *fp)
539 if (fp == 0) return 0;
540 if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
542 /* On Linux/Mac, netclose() is an alias of close(), but on
543 * Windows, it is an alias of closesocket(). */
544 if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
545 else netclose(fp->fd);
/* NOTE(review): size_cmd, which kftp_parse_url() allocates, is not among
 * the members freed in the lines below - confirm it is released
 * elsewhere. */
547 free(fp->host); free(fp->port);
548 free(fp->response); free(fp->retr); // FTP specific
549 free(fp->path); free(fp->http_host); // HTTP specific
/* Manual test driver: `type` selects a local file, one of two FTP URLs or
 * one of two HTTP URLs; each case opens the source, seeks and reads into a
 * 1MB buffer. */
563 buf = calloc(0x100000, 1);
565 fp = knet_open("knetfile.c", "r");
566 knet_seek(fp, 1000, SEEK_SET);
567 } else if (type == 1) { // NCBI FTP, large file
568 fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
/* Offset above 2^31, so this case exercises 64-bit offsets. */
569 knet_seek(fp, 2500000000ll, SEEK_SET);
570 l = knet_read(fp, buf, 255);
571 } else if (type == 2) {
572 fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
573 knet_seek(fp, 1000, SEEK_SET);
574 } else if (type == 3) {
575 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
576 knet_seek(fp, 1000, SEEK_SET);
577 } else if (type == 4) {
578 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
579 knet_read(fp, buf, 10000);
580 knet_seek(fp, 20000, SEEK_SET);
581 knet_seek(fp, 10000, SEEK_SET);
/* NOTE(review): up to 10000000 bytes are requested at buf+10000, but buf
 * holds only 0x100000 bytes; a source longer than the buffer would
 * overflow it. */
582 l = knet_read(fp, buf+10000, 10000000) + 10000;
/* NOTE(review): the handle returned by knet_open() is used without a NULL
 * check in every case above. */
584 if (type != 4 && type != 1) {
585 knet_read(fp, buf, 255);
588 } else write(fileno(stdout), buf, l);