3 Copyright (c) 2008 Genome Research Ltd (GRL).
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
28 /* Probably I will not do socket programming in the next few years and
29 therefore I decide to heavily annotate this file, for Linux and
30 Windows as well. -lh3 */
38 #include <sys/types.h>
44 #include <arpa/inet.h>
45 #include <sys/socket.h>
50 /* In winsock.h, the type of a socket is SOCKET, which is: "typedef
51 * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
52 * integer -1. In knetfile.c, I use "int" for socket type
53 * throughout. This should be improved to avoid confusion.
55 * In Linux/Mac, recv() and read() do almost the same thing. You can see
56 * in the header file that netread() is simply an alias of read(). In
57 * Windows, however, they are different and using recv() is mandatory.
/**
 * Wait until a descriptor is ready for reading (or for writing when
 * is_read == 0).
 *
 * The call blocks in select() for at most 5 seconds.
 *
 * @param fd       descriptor to watch
 * @param is_read  non-zero: wait for readability; zero: wait for writability
 * @return > 0 if the descriptor is ready, 0 on time-out, -1 on error
 *         (select() failed, or fd cannot be stored in an fd_set)
 */
static int socket_wait(int fd, int is_read)
{
    fd_set fds, *fdr = 0, *fdw = 0;
    struct timeval tv;
    int ret;

    if (fd < 0) return -1;
#ifndef _WIN32
    /* On POSIX, FD_SET() is undefined for descriptors >= FD_SETSIZE. */
    if (fd >= FD_SETSIZE) return -1;
#endif
    tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
    FD_ZERO(&fds);
    FD_SET(fd, &fds);
    if (is_read) fdr = &fds;
    else fdw = &fds;
    ret = select(fd+1, fdr, fdw, 0, &tv);
    if (ret == -1) perror("select");
    return ret;
}
#ifndef _WIN32
/**
 * Open a TCP connection to host:port (Unix/Mac implementation).
 *
 * This function does not work with Windows due to the lack of
 * getaddrinfo() in winsock. It is adapted from an example in "Beej's
 * Guide to Network Programming" (http://beej.us/guide/bgnet/).
 * Only the first address returned by getaddrinfo() is tried.
 *
 * @param host  host name or numeric address
 * @param port  service name or decimal port number
 * @return the connected socket descriptor, or -1 on failure (a message
 *         is written to stderr)
 */
static int socket_connect(const char *host, const char *port)
{
    int on = 1, fd = -1, rc;
    struct linger lng = { 0, 0 };
    struct addrinfo hints, *res = 0;

    memset(&hints, 0, sizeof(struct addrinfo));
    hints.ai_family = AF_UNSPEC;
    hints.ai_socktype = SOCK_STREAM;
    /* In Unix/Mac, getaddrinfo() is the most convenient way to get
     * server information. It reports errors through its return value,
     * not errno, and leaves `res` unset on failure. */
    rc = getaddrinfo(host, port, &hints, &res);
    if (rc != 0) {
        fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(rc));
        return -1;
    }
    fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol);
    if (fd == -1) { perror("socket"); goto fail; }
    /* The following two setsockopt() are used by ftplib
     * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they
     * are necessary. */
    if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) { perror("setsockopt"); goto fail; }
    if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) { perror("setsockopt"); goto fail; }
    if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) { perror("connect"); goto fail; }
    freeaddrinfo(res);
    return fd;

fail:
    /* perror() has already run, so close() may clobber errno here. */
    if (fd != -1) close(fd);
    freeaddrinfo(res);
    return -1;
}
#endif /* !_WIN32 */
/* MinGW's printf has problem with "%lld" */
/**
 * Format an unsigned 64-bit integer as a decimal string.
 *
 * @param buf  destination; must hold at least 21 bytes (20 digits + NUL)
 * @param x    value to format
 * @return buf, NUL-terminated
 */
char *uint64tostr(char *buf, uint64_t x)
{
    int cnt, i = 0;

    /* do/while so that x == 0 produces "0" instead of an empty string */
    do {
        buf[i++] = (char)('0' + x % 10);
        x /= 10;
    } while (x);
    buf[i] = 0;
    /* digits were produced least-significant first; reverse in place */
    for (cnt = i, i = 0; i < cnt/2; ++i) {
        char c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c;
    }
    return buf;
}
#ifdef _WIN32
/**
 * Initialise Winsock (version 2.2). On Windows this must be called
 * before any TCP connection is opened.
 *
 * @return the result of WSAStartup(): zero on success, otherwise an
 *         error code
 */
int knet_win32_init()
{
    WSADATA wsaData;
    return WSAStartup(MAKEWORD(2, 2), &wsaData);
}

/** Release Winsock; the counterpart of knet_win32_init(). */
void knet_win32_destroy()
{
    WSACleanup();
}
#endif /* _WIN32 */
126 /* A slightly modified version of the following function also works on
127 * Mac (and presumably Linux). However, this function is not stable on
128 * my Mac. It sometimes works fine but sometimes does not. Therefore for
129 * non-Windows OS, I do not use this one. */
/* Windows flavour of socket_connect(): opens an IPv4 TCP socket,
 * resolves `host`, and connects to the numeric port given in `port`
 * (converted with atoi(), so service names are not understood). */
130 static SOCKET socket_connect(const char *host, const char *port)
/* NOTE(review): this error macro returns -1 without closing `fd`, so a
 * failure after socket() succeeds leaves the socket open. */
132 #define __err_connect(func) do { perror(func); return -1; } while (0)
136 struct linger lng = { 0, 0 };
137 struct sockaddr_in server;
138 struct hostent *hp = 0;
140 if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
141 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
142 if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
/* A host beginning with a letter is looked up by name; anything else is
 * presumably treated as a dotted-quad address (the `else` line is not
 * visible here). NOTE(review): a host name that starts with a digit
 * would then take the numeric path -- confirm. */
144 if (isalpha(host[0])) hp = gethostbyname(host);
147 addr.s_addr = inet_addr(host);
148 hp = gethostbyaddr((char*)&addr, 4, AF_INET);
150 if (hp == 0) __err_connect("gethost");
/* The first resolved address is read through an `unsigned long *`. */
152 server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
153 server.sin_family= AF_INET;
154 server.sin_port = htons(atoi(port));
155 if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
156 // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
/**
 * Read up to `len` bytes from a socket, calling netread() repeatedly.
 *
 * recv() and read() may not read the required length of data with
 * one call, so they have to be called repeatedly.
 *
 * @param fd   socket descriptor
 * @param buf  destination buffer of at least `len` bytes
 * @param len  number of bytes wanted
 * @return number of bytes actually stored in buf; less than `len` on
 *         time-out, end of stream or read error
 */
static off_t my_netread(int fd, void *buf, off_t len)
{
    off_t rest = len, curr, l = 0;

    while (rest > 0) {
        if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
        curr = netread(fd, (char*)buf + l, rest);
        /* According to the glibc manual, section 13.2, a zero returned
         * value indicates end-of-file (EOF), which should mean that
         * read() will not return zero if EOF has not been met but data
         * are not immediately available. A negative value is an error
         * and must not be added to the byte count. */
        if (curr <= 0) break;
        l += curr; rest -= curr;
    }
    return l;
}
179 /*************************
180 * FTP specific routines *
181 *************************/
183 static int kftp_get_response(knetFile *ftp)
188 if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
189 while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
191 if (n >= ftp->max_response) {
192 ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256;
193 ftp->response = realloc(ftp->response, ftp->max_response);
195 ftp->response[n++] = c;
197 if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2])
198 && ftp->response[3] != '-') break;
203 if (n < 2) return -1;
204 ftp->response[n-2] = 0;
205 return strtol(ftp->response, &p, 0);
208 static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
210 if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
211 netwrite(ftp->ctrl_fd, cmd, strlen(cmd));
212 return is_get? kftp_get_response(ftp) : 0;
215 static int kftp_pasv_prep(knetFile *ftp)
219 kftp_send_cmd(ftp, "PASV\r\n", 1);
220 for (p = ftp->response; *p && *p != '('; ++p);
221 if (*p != '(') return -1;
223 sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
224 memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
225 ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
230 static int kftp_pasv_connect(knetFile *ftp)
232 char host[80], port[10];
233 if (ftp->pasv_port == 0) {
234 fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
237 sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
238 sprintf(port, "%d", ftp->pasv_port);
239 ftp->fd = socket_connect(host, port);
240 if (ftp->fd == -1) return -1;
244 int kftp_connect(knetFile *ftp)
246 ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
247 if (ftp->ctrl_fd == -1) return -1;
248 kftp_get_response(ftp);
249 kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
250 kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
251 kftp_send_cmd(ftp, "TYPE I\r\n", 1);
255 int kftp_reconnect(knetFile *ftp)
257 if (ftp->ctrl_fd != -1) {
258 netclose(ftp->ctrl_fd);
262 return kftp_connect(ftp);
265 // initialize ->type, ->host, ->retr and ->size
266 knetFile *kftp_parse_url(const char *fn, const char *mode)
271 if (strstr(fn, "ftp://") != fn) return 0;
272 for (p = (char*)fn + 6; *p && *p != '/'; ++p);
273 if (*p != '/') return 0;
275 fp = calloc(1, sizeof(knetFile));
276 fp->type = KNF_TYPE_FTP;
278 /* the Linux/Mac version of socket_connect() also recognizes a port
279 * like "ftp", but the Windows version does not. */
280 fp->port = strdup("21");
281 fp->host = calloc(l + 1, 1);
282 if (strchr(mode, 'c')) fp->no_reconnect = 1;
283 strncpy(fp->host, fn + 6, l);
284 fp->retr = calloc(strlen(p) + 8, 1);
285 sprintf(fp->retr, "RETR %s\r\n", p);
286 fp->size_cmd = calloc(strlen(p) + 8, 1);
287 sprintf(fp->size_cmd, "SIZE %s\r\n", p);
291 // place ->fd at offset off
/* Issues the SIZE, REST and RETR commands on the control connection
 * and opens the passive-mode data connection for the file.
 * NOTE(review): several lines of this function (declarations, braces,
 * returns, preprocessor conditionals) are not visible in this listing. */
292 int kftp_connect_file(knetFile *fp)
/* With no_reconnect set, one pending reply is read from the control
 * connection first. */
298 if (fp->no_reconnect) kftp_get_response(fp);
/* Ask for the file size; the reply is expected to be "<code> <size>". */
301 kftp_send_cmd(fp, fp->size_cmd, 1);
302 if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 )
304 fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
306 } else fp->file_size = file_size;
/* Two ways of building the "REST <offset>" command follow: sprintf with
 * %lld, and strcpy + uint64tostr(). Presumably a platform conditional
 * picks one of them (the directives are not visible) -- TODO confirm. */
310 sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
312 strcpy(tmp, "REST ");
313 uint64tostr(tmp + 5, fp->offset);
316 kftp_send_cmd(fp, tmp, 1);
/* RETR is sent without waiting for a reply (is_get == 0); the reply is
 * read only after the data connection has been opened.
 * NOTE(review): the results of kftp_send_cmd() and kftp_pasv_connect()
 * are not checked here. */
318 kftp_send_cmd(fp, fp->retr, 0);
319 kftp_pasv_connect(fp);
320 ret = kftp_get_response(fp);
322 fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
332 /**************************
333 * HTTP specific routines *
334 **************************/
336 knetFile *khttp_parse_url(const char *fn, const char *mode)
341 if (strstr(fn, "http://") != fn) return 0;
343 for (p = (char*)fn + 7; *p && *p != '/'; ++p);
345 fp = calloc(1, sizeof(knetFile));
346 fp->http_host = calloc(l + 1, 1);
347 strncpy(fp->http_host, fn + 7, l);
348 fp->http_host[l] = 0;
349 for (q = fp->http_host; *q && *q != ':'; ++q);
350 if (*q == ':') *q++ = 0;
352 proxy = getenv("http_proxy");
353 // set ->host, ->port and ->path
355 fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
356 fp->port = strdup(*q? q : "80");
357 fp->path = strdup(*p? p : "/");
359 fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
360 for (q = fp->host; *q && *q != ':'; ++q);
361 if (*q == ':') *q++ = 0;
362 fp->port = strdup(*q? q : "80");
363 fp->path = strdup(fn);
365 fp->type = KNF_TYPE_HTTP;
366 fp->ctrl_fd = fp->fd = -1;
371 int khttp_connect_file(knetFile *fp)
375 if (fp->fd != -1) netclose(fp->fd);
376 fp->fd = socket_connect(fp->host, fp->port);
377 buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
378 l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
379 l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
380 l += sprintf(buf + l, "\r\n");
381 netwrite(fp->fd, buf, l);
383 while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
384 if (buf[l] == '\n' && l >= 3)
385 if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
389 if (l < 14) { // prematured header
394 ret = strtol(buf + 8, &p, 0); // HTTP return code
395 if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
396 off_t rest = fp->offset;
398 off_t l = rest < 0x10000? rest : 0x10000;
399 rest -= my_netread(fp->fd, buf, l);
401 } else if (ret != 206 && ret != 200) {
403 fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
413 /********************
415 ********************/
417 knetFile *knet_open(const char *fn, const char *mode)
420 if (mode[0] != 'r') {
421 fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
424 if (strstr(fn, "ftp://") == fn) {
425 fp = kftp_parse_url(fn, mode);
426 if (fp == 0) return 0;
427 if (kftp_connect(fp) == -1) {
431 kftp_connect_file(fp);
432 } else if (strstr(fn, "http://") == fn) {
433 fp = khttp_parse_url(fn, mode);
434 if (fp == 0) return 0;
435 khttp_connect_file(fp);
436 } else { // local file
438 /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
439 * be undefined on some systems, although it is defined on my
440 * Mac and the Linux I have tested on. */
441 int fd = open(fn, O_RDONLY | O_BINARY);
443 int fd = open(fn, O_RDONLY);
449 fp = (knetFile*)calloc(1, sizeof(knetFile));
450 fp->type = KNF_TYPE_LOCAL;
454 if (fp && fp->fd == -1) {
461 knetFile *knet_dopen(int fd, const char *mode)
463 knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
464 fp->type = KNF_TYPE_LOCAL;
469 off_t knet_read(knetFile *fp, void *buf, off_t len)
472 if (fp->fd == -1) return 0;
473 if (fp->type == KNF_TYPE_FTP) {
474 if (fp->is_ready == 0) {
475 if (!fp->no_reconnect) kftp_reconnect(fp);
476 kftp_connect_file(fp);
478 } else if (fp->type == KNF_TYPE_HTTP) {
479 if (fp->is_ready == 0)
480 khttp_connect_file(fp);
482 if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
483 off_t rest = len, curr;
485 curr = read(fp->fd, buf + l, rest);
486 if (curr == 0) break;
487 l += curr; rest -= curr;
489 } else l = my_netread(fp->fd, buf, len);
/* Move the read position of fp.
 * Local files are repositioned with lseek(). For FTP and HTTP only
 * fp->offset is updated in the lines visible here; presumably the
 * connection is re-opened at that offset on the next read -- TODO
 * confirm against the lines missing from this listing.
 * NOTE(review): the return statements of this function are not visible
 * in this listing. */
494 off_t knet_seek(knetFile *fp, off_t off, int whence)
/* Seeking to the current absolute offset is a no-op. */
496 if (whence == SEEK_SET && off == fp->offset) return 0;
497 if (fp->type == KNF_TYPE_LOCAL) {
498 /* Be aware that lseek() returns the offset after seeking,
499 * while fseek() returns zero on success. */
500 off_t offset = lseek(fp->fd, off, whence);
508 else if (fp->type == KNF_TYPE_FTP)
510 if (whence==SEEK_CUR)
512 else if (whence==SEEK_SET)
514 else if ( whence==SEEK_END)
/* SEEK_END is relative to the size stored in fp->file_size. */
515 fp->offset = fp->file_size+off;
519 else if (fp->type == KNF_TYPE_HTTP)
521 if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
522 fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
525 if (whence==SEEK_CUR)
527 else if (whence==SEEK_SET)
535 int knet_close(knetFile *fp)
537 if (fp == 0) return 0;
538 if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
540 /* On Linux/Mac, netclose() is an alias of close(), but on
541 * Windows, it is an alias of closesocket(). */
542 if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
543 else netclose(fp->fd);
545 free(fp->host); free(fp->port);
546 free(fp->response); free(fp->retr); // FTP specific
547 free(fp->path); free(fp->http_host); // HTTP specific
/* Body of a manual test driver; its header and declarations are not
 * visible in this listing. `type` selects one scenario: a local file,
 * a large FTP file, a small FTP file, or one of two HTTP files.
 * NOTE(review): the result of knet_open() is used without a NULL check
 * in every branch. */
/* buf is 0x100000 (1,048,576) bytes. */
561 buf = calloc(0x100000, 1);
563 fp = knet_open("knetfile.c", "r");
564 knet_seek(fp, 1000, SEEK_SET);
565 } else if (type == 1) { // NCBI FTP, large file
566 fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
567 knet_seek(fp, 2500000000ll, SEEK_SET);
568 l = knet_read(fp, buf, 255);
569 } else if (type == 2) {
570 fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
571 knet_seek(fp, 1000, SEEK_SET);
572 } else if (type == 3) {
573 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
574 knet_seek(fp, 1000, SEEK_SET);
575 } else if (type == 4) {
576 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
577 knet_read(fp, buf, 10000);
578 knet_seek(fp, 20000, SEEK_SET);
579 knet_seek(fp, 10000, SEEK_SET);
/* NOTE(review): this asks for up to 10000000 bytes at buf+10000, which
 * is more than the 0x100000 bytes allocated for buf above. */
580 l = knet_read(fp, buf+10000, 10000000) + 10000;
/* Scenarios 0, 2 and 3 read 255 bytes after their seek; scenarios 1
 * and 4 write the `l` bytes already read to standard output. */
582 if (type != 4 && type != 1) {
583 knet_read(fp, buf, 255);
586 } else write(fileno(stdout), buf, l);