3 Copyright (c) 2008 by Genome Research Ltd (GRL).
4 2010 by Attractive Chaos <attractor@live.co.uk>
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
14 The above copyright notice and this permission notice shall be
15 included in all copies or substantial portions of the Software.
17 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
21 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
22 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
23 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 /* Probably I will not do socket programming in the next few years and
28 therefore I decide to heavily annotate this file, for Linux and
29 Windows as well. -ac */
38 #include <sys/types.h>
42 #include <arpa/inet.h>
43 #include <sys/socket.h>
48 /* In winsock.h, the type of a socket is SOCKET, which is: "typedef
49 * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
50 * integer -1. In knetfile.c, I use "int" for socket type
51 * throughout. This should be improved to avoid confusion.
53 * In Linux/Mac, recv() and read() do almost the same thing. You can see
54 * in the header file that netread() is simply an alias of read(). In
55 * Windows, however, they are different and using recv() is mandatory.
58 /* This function tests if the file handler is ready for reading (or
59 * writing if is_read==0). */
/* Waits in select() on fd for at most 5 seconds. Every caller in this file
 * treats a result <= 0 as not ready.
 * NOTE(review): this listing omits several lines of the function (the
 * fd_set initialisation, the write-side assignment of fdw, the platform
 * conditionals around the error reporting and the return statement,
 * presumably returning ret) -- check the complete source before editing. */
60 static int socket_wait(int fd, int is_read)
62 fd_set fds, *fdr = 0, *fdw = 0;
65 tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
/* Only the requested direction gets a non-null set passed to select(). */
68 if (is_read) fdr = &fds;
/* select() expects the highest descriptor number plus one. */
70 ret = select(fd+1, fdr, fdw, 0, &tv);
/* A select() failure is only reported on stderr here, not handled. */
72 if (ret == -1) perror("select");
/* Presumably the Windows-only reporting branch (WinSock error codes) -- confirm. */
75 fprintf(stderr, "select time-out\n");
76 else if (ret == SOCKET_ERROR)
77 fprintf(stderr, "select: %d\n", WSAGetLastError());
83 /* This function does not work with Windows due to the lack of
84 * getaddrinfo() in winsock. It is adapted from an example in "Beej's
85 * Guide to Network Programming" (http://beej.us/guide/bgnet/). */
/* Resolves host/port with getaddrinfo() (any address family, stream
 * socket), creates a socket for the first result, sets SO_REUSEADDR and a
 * zeroed SO_LINGER on it, and connects. Any failing step goes through
 * __err_connect, which prints the step name with perror(), frees res and
 * returns -1.
 * NOTE(review): the declarations of fd and on, and the success path
 * (presumably freeaddrinfo(res) followed by returning fd), are not visible
 * in this listing -- confirm.
 * NOTE(review): only the first addrinfo entry is used; no loop over ai_next
 * is visible, so a host whose first address is unreachable fails even if a
 * later one would work.
 * NOTE(review): __err_connect does not close fd, so the socket appears to
 * leak when setsockopt() or connect() fails.
 * NOTE(review): the comment that starts at the ftplib remark further down
 * has no visible terminator in this listing. */
86 static int socket_connect(const char *host, const char *port)
88 #define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0)
91 struct linger lng = { 0, 0 };
92 struct addrinfo hints, *res = 0;
93 memset(&hints, 0, sizeof(struct addrinfo));
94 hints.ai_family = AF_UNSPEC;
95 hints.ai_socktype = SOCK_STREAM;
96 /* In Unix/Mac, getaddrinfo() is the most convenient way to get
97 * server information. */
/* NOTE(review): getaddrinfo() reports failure through its return value, not
 * errno, so perror() prints an unrelated message here (gai_strerror() is
 * the matching call). On this path res is still null when the macro passes
 * it to freeaddrinfo() -- verify that this is safe on every target. */
98 if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo");
99 if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket");
100 /* The following two setsockopt() are used by ftplib
101 * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they
103 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt");
104 if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt");
105 if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect");
/* MinGW's printf has problem with "%lld", hence this hand-rolled formatter.
 *
 * Writes the decimal representation of x into buf and returns buf.
 * buf must have room for at least 21 bytes (sign, 19 digits, terminator).
 * Negative values get a leading '-'; the magnitude is computed in unsigned
 * arithmetic so that INT64_MIN is handled without overflow. */
char *int64tostr(char *buf, int64_t x)
{
	uint64_t mag = x < 0 ? (uint64_t)0 - (uint64_t)x : (uint64_t)x;
	int i = 0, lo, hi;

	if (x < 0) buf[i++] = '-';
	lo = i; /* first digit position; the sign stays in place */
	do {
		buf[i++] = (char)('0' + (int)(mag % 10));
		mag /= 10;
	} while (mag);
	buf[i] = '\0';
	/* Digits were produced least-significant first; reverse them in place. */
	for (hi = i - 1; lo < hi; ++lo, --hi) {
		char c = buf[lo]; buf[lo] = buf[hi]; buf[hi] = c;
	}
	return buf;
}
/* Parses a decimal integer from the start of buf and returns it.
 *
 * An optional leading '+' or '-' is accepted. Parsing stops at the first
 * character that is not a decimal digit, so trailing text such as CR/LF or
 * spaces no longer corrupts the value. An input without digits yields 0.
 * The magnitude is accumulated in unsigned arithmetic, so an over-long
 * digit string wraps instead of causing signed-overflow undefined
 * behaviour. */
int64_t strtoint64(const char *buf)
{
	uint64_t mag = 0;
	int negative = 0;

	if (*buf == '-' || *buf == '+') {
		negative = (*buf == '-');
		++buf;
	}
	for (; *buf >= '0' && *buf <= '9'; ++buf)
		mag = mag * 10 + (uint64_t)(*buf - '0');
	return negative ? (int64_t)((uint64_t)0 - mag) : (int64_t)mag;
}
133 /* In windows, the first thing is to establish the TCP connection. */
/* Requests WinSock version 2.2 via WSAStartup() and returns its result
 * unchanged to the caller.
 * NOTE(review): the declaration of wsaData is not visible in this listing. */
134 int knet_win32_init()
137 return WSAStartup(MAKEWORD(2, 2), &wsaData);
/* Counterpart of knet_win32_init().
 * NOTE(review): the body is not visible in this listing; presumably it
 * releases WinSock (WSACleanup) -- confirm against the complete source. */
139 void knet_win32_destroy()
143 /* A slightly modified version of the following function also works on
144 * Mac (and presumably Linux). However, this function is not stable on
145 * my Mac. It sometimes works fine but sometimes does not. Therefore for
146 * non-Windows OS, I do not use this one. */
/* WinSock variant: creates an IPv4 TCP socket, sets SO_REUSEADDR and a
 * zeroed SO_LINGER, looks the host up, and connects to host:port. Failures
 * go through __err_connect, which prints the step name together with
 * WSAGetLastError().
 * NOTE(review): parts of the macro, the declarations of fd, on and addr,
 * the else branch of the host lookup and the return statements are not
 * visible in this listing -- confirm.
 * NOTE(review): the return type is SOCKET, but callers in this file store
 * the result in an int and compare it with -1.
 * NOTE(review): the visible part of the macro does not close fd on failure. */
147 static SOCKET socket_connect(const char *host, const char *port)
149 #define __err_connect(func) \
151 fprintf(stderr, "%s: %d\n", func, WSAGetLastError()); \
157 struct linger lng = { 0, 0 };
158 struct sockaddr_in server;
159 struct hostent *hp = 0;
/* IPv4 only: AF_INET is hard-coded here and in the lookup below. */
161 if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
162 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
163 if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
/* A host starting with a letter is resolved by name; anything else is
 * presumably treated as a dotted-quad address by the two lines after it.
 * NOTE(review): a host name that begins with a digit would take the
 * address path. isalpha() also receives a plain char here. */
165 if (isalpha(host[0])) hp = gethostbyname(host);
168 addr.s_addr = inet_addr(host);
169 hp = gethostbyaddr((char*)&addr, 4, AF_INET);
171 if (hp == 0) __err_connect("gethost");
/* NOTE(review): this reads sizeof(unsigned long) bytes from a 4-byte IPv4
 * address; that matches only where unsigned long is 32 bits wide. */
173 server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
174 server.sin_family= AF_INET;
/* NOTE(review): atoi() gives no error indication for a non-numeric port. */
175 server.sin_port = htons(atoi(port));
176 if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
177 // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
/* Reads up to len bytes from socket fd into buf, calling netread()
 * repeatedly. Reading stops early when socket_wait() reports the socket as
 * not readable (result <= 0) or when netread() returns 0 (end of file).
 * l counts the bytes stored so far and rest the bytes still wanted.
 * NOTE(review): the loop header and the return statement (presumably
 * returning l) are not visible in this listing -- confirm.
 * NOTE(review): no visible line handles a negative netread() result; it
 * would be added to l and subtracted from rest as if it were a byte count.
 * NOTE(review): buf + l is arithmetic on a void pointer, which is a
 * compiler extension rather than standard C. */
182 static off_t my_netread(int fd, void *buf, off_t len)
184 off_t rest = len, curr, l = 0;
185 /* recv() and read() may not read the required length of data with
186 * one call. They have to be called repeatedly. */
188 if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
189 curr = netread(fd, buf + l, rest);
190 /* According to the glibc manual, section 13.2, a zero returned
191 * value indicates end-of-file (EOF), which should mean that
192 * read() will not return zero if EOF has not been met but data
193 * are not immediately available. */
194 if (curr == 0) break;
195 l += curr; rest -= curr;
200 /*************************
201 * FTP specific routines *
202 *************************/
/* Reads a reply from the FTP control connection into ftp->response, one
 * byte per netread() call, and returns the number parsed from its start.
 * Returns 0 when the control socket is not readable within the
 * socket_wait() timeout and -1 when fewer than two bytes were stored.
 * NOTE(review): declarations, the end-of-line handling that presumably
 * surrounds the digit test, and the closing braces are not visible in this
 * listing -- confirm before editing.
 * NOTE(review): the loop treats any non-zero netread() result as success,
 * including a negative error return.
 * NOTE(review): isdigit() receives plain char values; casting to unsigned
 * char is the portable form. */
204 static int kftp_get_response(knetFile *ftp)
213 if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
214 while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
/* Grow the buffer by doubling, starting at 256 bytes.
 * NOTE(review): the realloc() result overwrites ftp->response unchecked, so
 * an allocation failure loses the old buffer and the store below would
 * dereference a null pointer. */
216 if (n >= ftp->max_response) {
217 ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256;
218 ftp->response = realloc(ftp->response, ftp->max_response);
220 ftp->response[n++] = c;
/* Stop once the stored text starts with three digits whose next character
 * is not a dash (a dash marks a continued multi-line reply in FTP). */
222 if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2])
223 && ftp->response[3] != '-') break;
228 if (n < 2) return -1;
/* Cut off the last two stored bytes (presumably CR LF) before parsing. */
229 ftp->response[n-2] = 0;
230 return strtol(ftp->response, &p, 0);
/* Sends the command string cmd on the FTP control connection.
 * Returns -1 when the control socket is not writable within the
 * socket_wait() timeout. Otherwise returns the kftp_get_response() result
 * when is_get is non-zero, or 0 when no reply is awaited.
 * NOTE(review): the netwrite() result is ignored, so a failed or short
 * write goes unnoticed. */
233 static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
235 if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
236 netwrite(ftp->ctrl_fd, cmd, strlen(cmd));
237 return is_get? kftp_get_response(ftp) : 0;
/* Sends PASV and extracts the data-connection address from the reply: six
 * comma-separated integers after an opening parenthesis. The first four go
 * to ftp->pasv_ip and the last two form ftp->pasv_port (high byte, low
 * byte). Returns -1 when the reply contains no opening parenthesis.
 * NOTE(review): the declarations of p and v, the step that presumably moves
 * p past the parenthesis, and the success return are not visible in this
 * listing -- confirm.
 * NOTE(review): the PASV reply code and the sscanf() result are not
 * checked, so a malformed reply can leave entries of v unset.
 * NOTE(review): ftp->response is dereferenced unconditionally; verify it
 * cannot still be null when no reply was ever received. */
240 static int kftp_pasv_prep(knetFile *ftp)
244 kftp_send_cmd(ftp, "PASV\r\n", 1);
245 for (p = ftp->response; *p && *p != '('; ++p);
246 if (*p != '(') return -1;
248 sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
/* Copies four ints, which assumes pasv_ip holds int elements -- confirm
 * against the knetFile declaration. */
249 memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
/* Only the high byte is masked; v[5] is added without a range check. */
250 ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
/* Opens the FTP data connection to the address stored in ftp->pasv_ip and
 * ftp->pasv_port and stores the socket in ftp->fd. A zero pasv_port means
 * the address was never prepared and is reported on stderr. Returns -1
 * when socket_connect() fails.
 * NOTE(review): the return after the error message and the success return
 * are not visible in this listing -- confirm.
 * NOTE(review): port has room for 9 characters plus the terminator; since
 * the low port byte from the server is not range-checked when pasv_port is
 * computed, an extreme value could overflow it through sprintf(). */
255 static int kftp_pasv_connect(knetFile *ftp)
257 char host[80], port[10];
258 if (ftp->pasv_port == 0) {
259 fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
262 sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
263 sprintf(port, "%d", ftp->pasv_port);
264 ftp->fd = socket_connect(host, port);
265 if (ftp->fd == -1) return -1;
/* Opens the FTP control connection to ftp->host on ftp->port, reads the
 * server greeting, logs in as anonymous and selects binary transfer type
 * (TYPE I). Returns -1 when the control connection cannot be opened.
 * NOTE(review): the success return is not visible in this listing.
 * NOTE(review): the reply codes of the greeting and of USER, PASS and TYPE
 * are all discarded, so a refused login is not detected here. */
269 int kftp_connect(knetFile *ftp)
271 ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
272 if (ftp->ctrl_fd == -1) return -1;
273 kftp_get_response(ftp);
274 kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
275 kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
276 kftp_send_cmd(ftp, "TYPE I\r\n", 1);
/* Closes the control connection if one is open and then establishes a new
 * one through kftp_connect(), whose result is returned.
 * NOTE(review): the lines between the netclose() call and the final return
 * are not visible in this listing (presumably resetting ctrl_fd and closing
 * the data socket) -- confirm. */
280 int kftp_reconnect(knetFile *ftp)
282 if (ftp->ctrl_fd != -1) {
283 netclose(ftp->ctrl_fd);
288 return kftp_connect(ftp);
291 // initialize ->type, ->host, ->retr and ->size
/* Builds a knetFile for an URL of the form ftp://host/path. Returns 0 when
 * fn does not start with the ftp scheme or has no slash after the host.
 * The port is fixed to 21, and a c in mode sets no_reconnect. The RETR and
 * SIZE command strings for the path are prepared in advance.
 * NOTE(review): the declarations, the computation of l (presumably the host
 * length) and the return of fp are not visible in this listing -- confirm.
 * NOTE(review): none of the calloc()/strdup() results are checked. */
292 knetFile *kftp_parse_url(const char *fn, const char *mode)
/* strstr() == fn is used as a starts-with test. */
297 if (strstr(fn, "ftp://") != fn) return 0;
/* Skip the 6-character scheme and scan to the first slash after the host. */
298 for (p = (char*)fn + 6; *p && *p != '/'; ++p);
299 if (*p != '/') return 0;
301 fp = calloc(1, sizeof(knetFile));
302 fp->type = KNF_TYPE_FTP;
304 /* the Linux/Mac version of socket_connect() also recognizes a port
305 * like "ftp", but the Windows version does not. */
306 fp->port = strdup("21");
/* calloc() zero-fills, so the strncpy() below leaves host terminated. */
307 fp->host = calloc(l + 1, 1);
308 if (strchr(mode, 'c')) fp->no_reconnect = 1;
309 strncpy(fp->host, fn + 6, l);
/* 8 extra bytes: 5 for the verb and space, 2 for CR LF, 1 terminator. */
310 fp->retr = calloc(strlen(p) + 8, 1);
311 sprintf(fp->retr, "RETR %s\r\n", p);
312 fp->size_cmd = calloc(strlen(p) + 8, 1);
313 sprintf(fp->size_cmd, "SIZE %s\r\n", p);
317 // place ->fd at offset off
/* Prepares the FTP data transfer: queries the file size with the SIZE
 * command, sends REST with fp->offset, sends RETR without waiting for its
 * reply, opens the data connection and only then reads the reply.
 * NOTE(review): many lines are not visible in this listing (declarations,
 * the passive-mode preparation, the platform conditionals that presumably
 * select between the two size-parsing and the two REST-formatting variants,
 * the reply-code test before the last message, and all returns). Treat the
 * inline comments as a reading aid only and confirm against the complete
 * source.
 * NOTE(review): the results of the SIZE and REST commands and of
 * kftp_pasv_connect() are not checked in the visible lines. */
318 int kftp_connect_file(knetFile *fp)
/* With no_reconnect set, a reply is read first, presumably the one left
 * pending from the previous transfer. */
324 if (fp->no_reconnect) kftp_get_response(fp);
327 kftp_send_cmd(fp, fp->size_cmd, 1);
/* Variant 1: skip the reply code and read the size with sscanf(). */
329 if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 )
331 fprintf(stderr,"[kftp_connect_file] %s\n", fp->response);
/* Variant 2: skip to the first digit after the first space by hand.
 * NOTE(review): neither scan below stops at the string terminator, so a
 * reply without a space or without a later digit reads past the buffer. */
335 const char *p = fp->response;
336 while (*p != ' ') ++p;
337 while (*p < '0' || *p > '9') ++p;
338 file_size = strtoint64(p);
340 fp->file_size = file_size;
/* REST sets the restart offset; two formatting variants follow. */
344 sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
346 strcpy(tmp, "REST ");
347 int64tostr(tmp + 5, fp->offset);
350 kftp_send_cmd(fp, tmp, 1);
/* RETR is sent with is_get == 0: the data socket is connected before the
 * reply is read on the control connection. */
352 kftp_send_cmd(fp, fp->retr, 0);
353 kftp_pasv_connect(fp);
354 ret = kftp_get_response(fp);
356 fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
366 /**************************
367 * HTTP specific routines *
368 **************************/
/* Builds a knetFile for an URL of the form http://host[:port][/path].
 * Returns 0 when fn does not start with the http scheme. http_host keeps
 * the server name for the Host header; host, port and path describe what to
 * connect to and request, and depend on the http_proxy environment
 * variable. Both descriptors start out as -1.
 * NOTE(review): the declarations, the computation of l (presumably the
 * length of the host[:port] part), the test that selects between the
 * no-proxy and proxy branches, and the return of fp are not visible in this
 * listing -- confirm.
 * NOTE(review): mode is not used in the visible lines, and none of the
 * calloc()/strdup() results are checked. */
370 knetFile *khttp_parse_url(const char *fn, const char *mode)
375 if (strstr(fn, "http://") != fn) return 0;
/* Skip the 7-character scheme; p ends on the first slash or on the end of
 * the string when the URL has no path. */
377 for (p = (char*)fn + 7; *p && *p != '/'; ++p);
379 fp = calloc(1, sizeof(knetFile));
380 fp->http_host = calloc(l + 1, 1);
381 strncpy(fp->http_host, fn + 7, l);
382 fp->http_host[l] = 0;
/* Split an optional :port suffix off http_host; q then points at the port
 * text, or at the terminator when there is none. */
383 for (q = fp->http_host; *q && *q != ':'; ++q);
384 if (*q == ':') *q++ = 0;
386 proxy = getenv("http_proxy");
387 // set ->host, ->port and ->path
389 fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
390 fp->port = strdup(*q? q : "80");
391 fp->path = strdup(*p? p : "/");
/* Proxy case: connect to the proxy and request the complete URL.
 * NOTE(review): anything after the proxy port, such as a trailing slash,
 * stays part of the port string. */
393 fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
394 for (q = fp->host; *q && *q != ':'; ++q);
395 if (*q == ':') *q++ = 0;
396 fp->port = strdup(*q? q : "80");
397 fp->path = strdup(fn);
399 fp->type = KNF_TYPE_HTTP;
400 fp->ctrl_fd = fp->fd = -1;
/* (Re)opens the HTTP connection and positions the stream at fp->offset.
 * Sends an HTTP/1.0 GET with a Range header starting at the offset, reads
 * the response header one byte at a time up to the blank line, and parses
 * the status code. A 206 reply needs no further work; a 200 reply means the
 * server sent the whole file, so the first offset bytes are read and
 * thrown away; any other status is reported on stderr.
 * NOTE(review): the declarations, the increment of l in the header loop,
 * the loop header of the skip loop, the cleanup of buf and all returns are
 * not visible in this listing -- confirm.
 * NOTE(review): the socket_connect() result is not checked before writing.
 * NOTE(review): the request is formatted with unbounded sprintf() calls, so
 * a path plus host longer than the 64 KB buffer overflows it; no visible
 * line bounds l while the header is being read either.
 * NOTE(review): the calloc() and netwrite() results are not checked. */
405 int khttp_connect_file(knetFile *fp)
409 if (fp->fd != -1) netclose(fp->fd);
410 fp->fd = socket_connect(fp->host, fp->port);
411 buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
412 l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
413 l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
414 l += sprintf(buf + l, "\r\n");
415 netwrite(fp->fd, buf, l);
/* The header ends at the first CR LF CR LF sequence. */
417 while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
418 if (buf[l] == '\n' && l >= 3)
419 if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
423 if (l < 14) { // prematured header
/* Offset 8 skips the protocol version token that precedes the status code. */
428 ret = strtol(buf + 8, &p, 0); // HTTP return code
429 if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file
430 off_t rest = fp->offset;
/* This inner l shadows the outer header-length variable of the same name.
 * NOTE(review): if my_netread() keeps returning 0, rest never shrinks;
 * verify that the hidden loop condition cannot spin forever. */
432 off_t l = rest < 0x10000? rest : 0x10000;
433 rest -= my_netread(fp->fd, buf, l);
435 } else if (ret != 206 && ret != 200) {
437 fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
447 /********************
449 ********************/
/* Opens fn for reading and returns a knetFile handle. The URL scheme
 * selects the backend: ftp, http, or otherwise a local file opened with
 * open(). Only modes that start with r are accepted.
 * NOTE(review): the declarations, the returns after error messages, the
 * cleanup inside the failure branches, the preprocessor test that
 * presumably chooses between the two open() calls, the assignment of the
 * local descriptor and the final return are not visible in this listing --
 * confirm.
 * NOTE(review): the message below names kftp_open although this function is
 * knet_open.
 * NOTE(review): the results of kftp_connect_file() and khttp_connect_file()
 * are ignored; failure is instead detected afterwards through fp->fd being
 * -1. The calloc() result for the local case is not checked. */
451 knetFile *knet_open(const char *fn, const char *mode)
454 if (mode[0] != 'r') {
455 fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
458 if (strstr(fn, "ftp://") == fn) {
459 fp = kftp_parse_url(fn, mode);
460 if (fp == 0) return 0;
461 if (kftp_connect(fp) == -1) {
465 kftp_connect_file(fp);
466 } else if (strstr(fn, "http://") == fn) {
467 fp = khttp_parse_url(fn, mode);
468 if (fp == 0) return 0;
469 khttp_connect_file(fp);
470 } else { // local file
472 /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
473 * be undefined on some systems, although it is defined on my
474 * Mac and the Linux I have tested on. */
475 int fd = open(fn, O_RDONLY | O_BINARY);
477 int fd = open(fn, O_RDONLY);
483 fp = (knetFile*)calloc(1, sizeof(knetFile));
484 fp->type = KNF_TYPE_LOCAL;
/* A handle without a usable descriptor counts as a failed open. */
488 if (fp && fp->fd == -1) {
/* Creates a knetFile of the local type for an already open descriptor.
 * NOTE(review): the lines that presumably store fd in the handle and return
 * it are not visible in this listing -- confirm.
 * NOTE(review): mode is not used in the visible lines, and the calloc()
 * result is not checked before it is dereferenced. */
495 knetFile *knet_dopen(int fd, const char *mode)
497 knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
498 fp->type = KNF_TYPE_LOCAL;
/* Reads up to len bytes from fp into buf. Returns 0 at once when the handle
 * has no descriptor. For FTP and HTTP handles the transfer is (re)started
 * first when is_ready is 0. Local files are read with a read() loop that
 * retries on EINTR, returns -1 on any other error and stops at end of
 * file; network handles go through my_netread().
 * NOTE(review): the declaration of l, the loop headers, the update of
 * fp->offset and the final return are not visible in this listing --
 * confirm.
 * NOTE(review): the results of the reconnect and connect_file calls are
 * ignored.
 * NOTE(review): buf + l is arithmetic on a void pointer, which is a
 * compiler extension rather than standard C. */
503 off_t knet_read(knetFile *fp, void *buf, off_t len)
506 if (fp->fd == -1) return 0;
507 if (fp->type == KNF_TYPE_FTP) {
508 if (fp->is_ready == 0) {
/* With no_reconnect set, the existing control connection is reused. */
509 if (!fp->no_reconnect) kftp_reconnect(fp);
510 kftp_connect_file(fp);
512 } else if (fp->type == KNF_TYPE_HTTP) {
513 if (fp->is_ready == 0)
514 khttp_connect_file(fp);
516 if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
517 off_t rest = len, curr;
/* Retry the read when it was interrupted by a signal. */
520 curr = read(fp->fd, buf + l, rest);
521 } while (curr < 0 && EINTR == errno);
522 if (curr < 0) return -1;
523 if (curr == 0) break;
524 l += curr; rest -= curr;
526 } else l = my_netread(fp->fd, buf, len);
/* Moves the read position of fp. Returns 0 without doing anything when an
 * absolute seek targets the current offset. Local files use lseek(). For
 * FTP the new offset is derived from whence (SEEK_END is relative to
 * file_size); for HTTP, SEEK_END is rejected with a message.
 * NOTE(review): most statement bodies are not visible in this listing (the
 * offset assignments for SEEK_CUR and SEEK_SET, the handling of the lseek()
 * result, the reset of is_ready and all returns) -- confirm before editing.
 * NOTE(review): the last message prints strerror(errno); verify that errno
 * is meaningful on the path that reaches it. */
531 off_t knet_seek(knetFile *fp, int64_t off, int whence)
533 if (whence == SEEK_SET && off == fp->offset) return 0;
534 if (fp->type == KNF_TYPE_LOCAL) {
535 /* Be aware that lseek() returns the offset after seeking,
536 * while fseek() returns zero on success. */
537 off_t offset = lseek(fp->fd, off, whence);
539 // Be silent, it is OK for knet_seek to fail when the file is streamed
540 // fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
546 else if (fp->type == KNF_TYPE_FTP)
548 if (whence==SEEK_CUR)
550 else if (whence==SEEK_SET)
552 else if ( whence==SEEK_END)
553 fp->offset = fp->file_size+off;
557 else if (fp->type == KNF_TYPE_HTTP)
559 if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future?
560 fprintf(stderr, "[knet_seek] SEEK_END is not supported for HTTP. Offset is unchanged.\n");
564 if (whence==SEEK_CUR)
566 else if (whence==SEEK_SET)
572 fprintf(stderr,"[knet_seek] %s\n", strerror(errno));
/* Closes the descriptors of fp and releases the strings it owns. A null fp
 * is accepted and returns 0. The FTP control socket is closed when open;
 * the data descriptor is closed with close() for local files and with
 * netclose() otherwise.
 * NOTE(review): the guard around the descriptor close, the release of fp
 * itself and the final return are not visible in this listing -- confirm.
 * NOTE(review): size_cmd, allocated in kftp_parse_url(), is not freed by
 * any visible line; check the complete source for a leak. */
576 int knet_close(knetFile *fp)
578 if (fp == 0) return 0;
579 if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
581 /* On Linux/Mac, netclose() is an alias of close(), but on
582 * Windows, it is an alias of closesocket(). */
583 if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
584 else netclose(fp->fd);
586 free(fp->host); free(fp->port);
587 free(fp->response); free(fp->retr); // FTP specific
588 free(fp->path); free(fp->http_host); // HTTP specific
/* NOTE(review): the beginning of this definition is not visible in this
 * listing; the lines below look like the body of a manual test driver that
 * picks one scenario by the value of type (local file, FTP, HTTP), seeks,
 * reads into a 1 MB buffer and writes the bytes read to standard output.
 * The declarations, the first branch test and the end of the function are
 * missing -- confirm against the complete source.
 * NOTE(review): no visible line checks the knet_open() result before it is
 * passed to knet_seek() or knet_read(), and the remote URLs are external
 * resources that may no longer exist. */
602 buf = calloc(0x100000, 1);
604 fp = knet_open("knetfile.c", "r");
605 knet_seek(fp, 1000, SEEK_SET);
606 } else if (type == 1) { // NCBI FTP, large file
607 fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
/* The offset exceeds 2^31, so this case exercises 64-bit offsets. */
608 knet_seek(fp, 2500000000ll, SEEK_SET);
609 l = knet_read(fp, buf, 255);
610 } else if (type == 2) {
611 fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
612 knet_seek(fp, 1000, SEEK_SET);
613 } else if (type == 3) {
614 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
615 knet_seek(fp, 1000, SEEK_SET);
616 } else if (type == 4) {
617 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
/* Read, seek forward, seek back, then read again into the same buffer. */
618 knet_read(fp, buf, 10000);
619 knet_seek(fp, 20000, SEEK_SET);
620 knet_seek(fp, 10000, SEEK_SET);
621 l = knet_read(fp, buf+10000, 10000000) + 10000;
623 if (type != 4 && type != 1) {
624 knet_read(fp, buf, 255);
627 } else write(fileno(stdout), buf, l);