3 Copyright (c) 2008 Genome Research Ltd (GRL).
5 Permission is hereby granted, free of charge, to any person obtaining
6 a copy of this software and associated documentation files (the
7 "Software"), to deal in the Software without restriction, including
8 without limitation the rights to use, copy, modify, merge, publish,
9 distribute, sublicense, and/or sell copies of the Software, and to
10 permit persons to whom the Software is furnished to do so, subject to
11 the following conditions:
13 The above copyright notice and this permission notice shall be
14 included in all copies or substantial portions of the Software.
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20 BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21 ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 /* Contact: Heng Li <lh3@sanger.ac.uk> */
28 /* Probably I will not do socket programming in the next few years and
29 therefore I decide to heavily annotate this file, for Linux and
30 Windows as well. -lh3 */
38 #include <sys/types.h>
44 #include <arpa/inet.h>
45 #include <sys/socket.h>
50 /* In winsock.h, the type of a socket is SOCKET, which is: "typedef
51 * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed
52 * integer -1. In knetfile.c, I use "int" for socket type
53 * throughout. This should be improved to avoid confusion.
55 * In Linux/Mac, recv() and read() do almost the same thing. You can see
56 * in the header file that netread() is simply an alias of read(). In
57 * Windows, however, they are different and using recv() is mandatory.
60 /* This function tests if the file descriptor is ready for reading (or
61 * writing if is_read==0). */
/* Waits in select() on the single descriptor `fd` with a 5-second timeout.
 *   fd:      descriptor to watch.
 *   is_read: non-zero puts `fd` in the read set.
 * Callers in this file treat a result <= 0 as "not ready".
 * NOTE(review): this copy appears to be missing lines of the function (no
 * braces, no declarations of `tv`/`ret`, no write-set branch and no return
 * are visible) -- confirm against the upstream file. */
62 static int socket_wait(int fd, int is_read)
64 fd_set fds, *fdr = 0, *fdw = 0;
67 tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out
70 if (is_read) fdr = &fds;
/* First argument of select() is the highest watched descriptor plus one. */
72 ret = select(fd+1, fdr, fdw, 0, &tv);
/* A select() failure is reported on stderr via perror(). */
73 if (ret == -1) perror("select");
78 /* This function does not work with Windows due to the lack of
79 * getaddrinfo() in winsock. It is adapted from an example in "Beej's
80 * Guide to Network Programming" (http://beej.us/guide/bgnet/). */
/* Open a TCP connection to host:port.
 *   host: host name or numeric address, resolved by getaddrinfo().
 *   port: numeric port or a service name known to getaddrinfo().
 * Returns the connected socket descriptor, or -1 on failure after writing
 * a diagnostic to stderr. No descriptor or addrinfo list is leaked on any
 * path. */
static int socket_connect(const char *host, const char *port)
{
	int on = 1, fd = -1, rc;
	struct linger lng = { 0, 0 };
	struct addrinfo hints, *res = 0;
	const char *failed = 0; /* name of the call that failed, if any */

	memset(&hints, 0, sizeof(struct addrinfo));
	hints.ai_family = AF_UNSPEC;
	hints.ai_socktype = SOCK_STREAM;
	/* In Unix/Mac, getaddrinfo() is the most convenient way to get
	 * server information. On failure it does not fill in `res` and it
	 * reports the reason through its return value rather than errno, so
	 * neither freeaddrinfo() nor perror() may be used on this path. */
	if ((rc = getaddrinfo(host, port, &hints, &res)) != 0) {
		fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(rc));
		return -1;
	}
	/* The two setsockopt() calls are used by ftplib
	 * (http://nbpfaus.net/~pfau/ftplib/); they may not be necessary. */
	if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) failed = "socket";
	else if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) failed = "setsockopt";
	else if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) failed = "setsockopt";
	else if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) failed = "connect";
	if (failed) {
		perror(failed); /* report before close() can disturb errno */
		if (fd != -1) close(fd);
		fd = -1;
	}
	freeaddrinfo(res);
	return fd;
}
105 /* In windows, the first thing is to establish the TCP connection. */
/* Calls WSAStartup() requesting Winsock version 2.2 and returns its result
 * unchanged.
 * NOTE(review): the declaration of `wsaData` and the function braces are
 * not visible in this copy -- confirm against the upstream file. */
106 int knet_win32_init()
109 return WSAStartup(MAKEWORD(2, 2), &wsaData);
/* NOTE(review): only the signature is visible in this copy. Presumably the
 * Winsock teardown counterpart of knet_win32_init() -- confirm the body
 * against the upstream file. */
111 void knet_win32_destroy()
115 /* A slightly modified version of the following function also works on
116 * Mac (and presumably Linux). However, this function is not stable on
117 * my Mac. It sometimes works fine but sometimes does not. Therefore for
118 * non-Windows OS, I do not use this one. */
/* Winsock variant: opens an IPv4 TCP connection to host:port.
 *   host: a name (looked up with gethostbyname() when its first character
 *         is alphabetic) or a dotted address (inet_addr() followed by
 *         gethostbyaddr()).
 *   port: decimal port number, converted with atoi().
 * Every failure goes through __err_connect, which calls perror() and
 * returns -1.
 * NOTE(review): __err_connect does not close `fd`, so the socket appears to
 * leak on every failure after socket() succeeded -- confirm.
 * NOTE(review): several lines (braces, declarations of `on`, `fd`, `addr`,
 * the else branch, the final return) are not visible in this copy. */
119 static SOCKET socket_connect(const char *host, const char *port)
121 #define __err_connect(func) do { perror(func); return -1; } while (0)
125 struct linger lng = { 0, 0 };
126 struct sockaddr_in server;
127 struct hostent *hp = 0;
129 if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket");
130 if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt");
131 if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt");
/* Resolve the host: by name if it starts with a letter, otherwise by address. */
133 if (isalpha(host[0])) hp = gethostbyname(host);
136 addr.s_addr = inet_addr(host);
137 hp = gethostbyaddr((char*)&addr, 4, AF_INET);
139 if (hp == 0) __err_connect("gethost");
/* Fill in the IPv4 endpoint from the first resolved address and the port. */
141 server.sin_addr.s_addr = *((unsigned long*)hp->h_addr);
142 server.sin_family= AF_INET;
143 server.sin_port = htons(atoi(port));
144 if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect");
145 // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!)
/* Read up to `len` bytes from socket `fd` into `buf`, calling netread()
 * repeatedly because recv()/read() may return less than requested.
 * Returns the number of bytes actually stored, which is smaller than `len`
 * when the socket times out, reaches end-of-file or reports an error. */
static off_t my_netread(int fd, void *buf, off_t len)
{
	char *dst = (char*)buf; /* arithmetic on void* is only a compiler extension */
	off_t rest = len, curr, l = 0;
	while (rest > 0) {
		if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading
		curr = netread(fd, dst + l, rest);
		/* Zero means end-of-file; a negative value is an error and must
		 * not be added to the byte counters. */
		if (curr <= 0) break;
		l += curr; rest -= curr;
	}
	return l;
}
168 /*************************
169 * FTP specific routines *
170 *************************/
172 static int kftp_get_response(knetFile *ftp)
177 if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0;
178 while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O
180 if (n >= ftp->max_response) {
181 ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256;
182 ftp->response = realloc(ftp->response, ftp->max_response);
184 ftp->response[n++] = c;
186 if (n >= 4 && isdigit(ftp->response[0]) && isdigit(ftp->response[1]) && isdigit(ftp->response[2])
187 && ftp->response[3] != '-') break;
192 if (n < 2) return -1;
193 ftp->response[n-2] = 0;
194 return strtol(ftp->response, &p, 0);
197 static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get)
199 if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing
200 netwrite(ftp->ctrl_fd, cmd, strlen(cmd));
201 return is_get? kftp_get_response(ftp) : 0;
204 static int kftp_pasv_prep(knetFile *ftp)
208 kftp_send_cmd(ftp, "PASV\r\n", 1);
209 for (p = ftp->response; *p && *p != '('; ++p);
210 if (*p != '(') return -1;
212 sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]);
213 memcpy(ftp->pasv_ip, v, 4 * sizeof(int));
214 ftp->pasv_port = (v[4]<<8&0xff00) + v[5];
219 static int kftp_pasv_connect(knetFile *ftp)
221 char host[80], port[10];
222 if (ftp->pasv_port == 0) {
223 fprintf(stderr, "[kftp_pasv_connect] kftp_pasv_prep() is not called before hand.\n");
226 sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]);
227 sprintf(port, "%d", ftp->pasv_port);
228 ftp->fd = socket_connect(host, port);
229 if (ftp->fd == -1) return -1;
/* Opens the FTP control connection to ftp->host:ftp->port and logs in.
 * Returns -1 when the control socket cannot be opened.
 * NOTE(review): the braces and the success return are not visible in this
 * copy -- confirm against the upstream file. */
233 int kftp_connect(knetFile *ftp)
235 ftp->ctrl_fd = socket_connect(ftp->host, ftp->port);
236 if (ftp->ctrl_fd == -1) return -1;
/* Consume the server greeting, then log in anonymously and select binary
 * ("TYPE I") transfers. The reply codes of these steps are not inspected. */
237 kftp_get_response(ftp);
238 kftp_send_cmd(ftp, "USER anonymous\r\n", 1);
239 kftp_send_cmd(ftp, "PASS kftp@\r\n", 1);
240 kftp_send_cmd(ftp, "TYPE I\r\n", 1);
/* Closes the current control connection, if one is open, and establishes a
 * new one through kftp_connect(), whose result is returned.
 * NOTE(review): lines between the visible ones are missing in this copy
 * (presumably resetting ctrl_fd and closing the data socket) -- confirm. */
244 int kftp_reconnect(knetFile *ftp)
246 if (ftp->ctrl_fd != -1) {
247 netclose(ftp->ctrl_fd);
251 return kftp_connect(ftp);
254 // initialize ->type, ->host and ->retr
/* Builds a knetFile for an "ftp://host/path" URL without connecting.
 *   fn:   the URL; must start with "ftp://" and contain a '/' after the host.
 *   mode: if it contains 'c', no_reconnect is set.
 * Returns 0 for a URL that does not have that shape.
 * NOTE(review): the declarations, the assignment of `l`, and the final
 * return are not visible in this copy -- confirm against the upstream file. */
255 knetFile *kftp_parse_url(const char *fn, const char *mode)
260 if (strstr(fn, "ftp://") != fn) return 0;
/* Advance p to the first '/' after the host part. */
261 for (p = (char*)fn + 6; *p && *p != '/'; ++p);
262 if (*p != '/') return 0;
264 fp = calloc(1, sizeof(knetFile));
265 fp->type = KNF_TYPE_FTP;
267 /* the Linux/Mac version of socket_connect() also recognizes a port
268 * like "ftp", but the Windows version does not. */
269 fp->port = strdup("21");
/* The host buffer is zero-filled with one spare byte, so the strncpy()
 * below leaves it terminated. */
270 fp->host = calloc(l + 1, 1);
271 if (strchr(mode, 'c')) fp->no_reconnect = 1;
272 strncpy(fp->host, fn + 6, l);
/* 8 extra bytes: "RETR " (5) + "\r\n" (2) + terminating NUL (1). */
273 fp->retr = calloc(strlen(p) + 8, 1);
274 sprintf(fp->retr, "RETR %s\r\n", p);
275 fp->seek_offset = -1;
278 // place ->fd at offset off
/* Starts (or restarts) the FTP download of fp->retr, using "REST" to resume
 * at fp->offset.
 * NOTE(review): several lines are missing in this copy (braces, the
 * conditions guarding the visible statements, the passive-mode preparation,
 * the return statements) -- confirm against the upstream file. */
279 int kftp_connect_file(knetFile *fp)
284 if (fp->no_reconnect) kftp_get_response(fp);
/* Ask the server to restart the transfer at fp->offset. */
289 sprintf(tmp, "REST %lld\r\n", (long long)fp->offset);
290 kftp_send_cmd(fp, tmp, 1);
/* RETR is sent without waiting for the reply; the reply is read only after
 * the data connection has been attempted. The result of
 * kftp_pasv_connect() is not checked. */
292 kftp_send_cmd(fp, fp->retr, 0);
293 kftp_pasv_connect(fp);
294 ret = kftp_get_response(fp);
296 fprintf(stderr, "[kftp_connect_file] %s\n", fp->response);
305 /**************************
306 * HTTP specific routines *
307 **************************/
/* Builds a knetFile for an "http://host[:port]/path" URL without connecting.
 *   fn: the URL; anything not starting with "http://" yields 0.
 * The environment variable http_proxy is read to choose between a direct
 * and a proxied connection.
 * NOTE(review): the declarations, the assignment of `l`, the if/else around
 * the two host/port/path groups, and the final return are not visible in
 * this copy -- confirm against the upstream file. */
309 knetFile *khttp_parse_url(const char *fn, const char *mode)
314 if (strstr(fn, "http://") != fn) return 0;
/* Advance p to the first '/' after the host part (or to the end of fn). */
316 for (p = (char*)fn + 7; *p && *p != '/'; ++p);
318 fp = calloc(1, sizeof(knetFile));
319 fp->http_host = calloc(l + 1, 1);
320 strncpy(fp->http_host, fn + 7, l);
321 fp->http_host[l] = 0;
/* Split "host:port" in place; q is left on the port text, or on the
 * terminating NUL when no port was given. */
322 for (q = fp->http_host; *q && *q != ':'; ++q);
323 if (*q == ':') *q++ = 0;
325 proxy = getenv("http_proxy");
326 // set ->host, ->port and ->path
/* Direct connection: connect to the URL's own host, default port "80",
 * default path "/". */
328 fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name.
329 fp->port = strdup(*q? q : "80");
330 fp->path = strdup(*p? p : "/");
/* Proxied connection: connect to the proxy (an optional "http://" prefix is
 * skipped) and request the full URL as the path. */
332 fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy);
333 for (q = fp->host; *q && *q != ':'; ++q);
334 if (*q == ':') *q++ = 0;
335 fp->port = strdup(*q? q : "80");
336 fp->path = strdup(fn);
338 fp->type = KNF_TYPE_HTTP;
339 fp->ctrl_fd = fp->fd = -1;
340 fp->seek_offset = -1;
344 int khttp_connect_file(knetFile *fp)
348 if (fp->fd != -1) netclose(fp->fd);
349 fp->fd = socket_connect(fp->host, fp->port);
350 buf = calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough.
351 l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host);
353 l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset);
354 l += sprintf(buf + l, "\r\n");
355 netwrite(fp->fd, buf, l);
357 while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency
358 if (buf[l] == '\n' && l >= 3)
359 if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break;
363 if (l < 14) { // prematured header
368 ret = strtol(buf + 8, &p, 0); // HTTP return code
369 if (ret == 200 && fp->offset) { // 200 (complete result); then skip beginning of the file
370 off_t rest = fp->offset;
372 off_t l = rest < 0x10000? rest : 0x10000;
373 rest -= my_netread(fp->fd, buf, l);
375 } else if (ret != 206 && ret != 200) {
377 fprintf(stderr, "[khttp_connect_file] fail to open file (HTTP code: %d).\n", ret);
387 /********************
389 ********************/
/* Opens `fn` for reading and returns a knetFile handle.
 *   fn:   "ftp://..." URL, "http://..." URL, or anything else, which is
 *         treated as a local path.
 *   mode: must start with 'r'; it is also passed on to the URL parsers.
 * NOTE(review): many lines are missing in this copy (braces, the return
 * statements, the preprocessor lines selecting one of the two open() calls,
 * error handling after open()) -- confirm against the upstream file. */
391 knetFile *knet_open(const char *fn, const char *mode)
394 if (mode[0] != 'r') {
395 fprintf(stderr, "[kftp_open] only mode \"r\" is supported.\n");
/* FTP: parse the URL, log in, then start the file transfer. */
398 if (strstr(fn, "ftp://") == fn) {
399 fp = kftp_parse_url(fn, mode);
400 if (fp == 0) return 0;
401 if (kftp_connect(fp) == -1) {
405 kftp_connect_file(fp);
/* HTTP: parse the URL, then send the request. */
406 } else if (strstr(fn, "http://") == fn) {
407 fp = khttp_parse_url(fn, mode);
408 if (fp == 0) return 0;
409 khttp_connect_file(fp);
410 } else { // local file
412 /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may
413 * be undefined on some systems, although it is defined on my
414 * Mac and the Linux I have tested on. */
415 int fd = open(fn, O_RDONLY | O_BINARY);
417 int fd = open(fn, O_RDONLY);
423 fp = (knetFile*)calloc(1, sizeof(knetFile));
424 fp->type = KNF_TYPE_LOCAL;
/* A handle whose data descriptor is still -1 is treated specially here;
 * the handling itself is not visible in this copy. */
428 if (fp && fp->fd == -1) {
/* Allocates a zero-filled knetFile of type KNF_TYPE_LOCAL for an already
 * open descriptor.
 * NOTE(review): the lines that would store `fd` in the handle and return it
 * are not visible in this copy -- confirm against the upstream file. */
435 knetFile *knet_dopen(int fd, const char *mode)
437 knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile));
438 fp->type = KNF_TYPE_LOCAL;
443 off_t knet_read(knetFile *fp, void *buf, off_t len)
446 if (fp->fd == -1) return 0;
447 if (fp->type == KNF_TYPE_FTP) {
448 if (fp->is_ready == 0) {
449 if (!fp->no_reconnect) kftp_reconnect(fp);
450 kftp_connect_file(fp);
452 } else if (fp->type == KNF_TYPE_HTTP) {
453 if (fp->is_ready == 0)
454 khttp_connect_file(fp);
456 if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX
457 off_t rest = len, curr;
459 curr = read(fp->fd, buf + l, rest);
460 if (curr == 0) break;
461 l += curr; rest -= curr;
463 } else l = my_netread(fp->fd, buf, len);
/* Moves the read position of fp.
 *   off, whence: as for lseek(); for FTP/HTTP handles only SEEK_SET is
 *                accepted.
 * Returns 0 immediately when an absolute seek targets the current offset.
 * NOTE(review): the lines that handle the lseek() result, update the
 * handle's offset fields and return are not visible in this copy -- confirm
 * against the upstream file. */
468 int knet_seek(knetFile *fp, off_t off, int whence)
470 if (whence == SEEK_SET && off == fp->offset) return 0;
471 if (fp->type == KNF_TYPE_LOCAL) {
472 /* Be aware that lseek() returns the offset after seeking,
473 * while fseek() returns zero on success. */
474 off_t offset = lseek(fp->fd, off, whence);
481 } else if (fp->type == KNF_TYPE_FTP || fp->type == KNF_TYPE_HTTP) {
482 if (whence != SEEK_SET) { // FIXME: we can surely allow SEEK_CUR and SEEK_END in future
483 fprintf(stderr, "[knet_seek] only SEEK_SET is supported for FTP/HTTP. Offset is unchanged.\n");
/* Closes the descriptors held by fp and frees its string fields.
 * A null fp is accepted and yields 0.
 * NOTE(review): the braces, any guard around the data-descriptor close, the
 * release of fp itself and the final return are not visible in this copy --
 * confirm against the upstream file. */
493 int knet_close(knetFile *fp)
495 if (fp == 0) return 0;
496 if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific
498 /* On Linux/Mac, netclose() is an alias of close(), but on
499 * Windows, it is an alias of closesocket(). */
500 if (fp->type == KNF_TYPE_LOCAL) close(fp->fd);
501 else netclose(fp->fd);
/* free() accepts null pointers, so fields unused by this handle type are
 * harmless here. */
503 free(fp->host); free(fp->port);
504 free(fp->response); free(fp->retr); // FTP specific
505 free(fp->path); free(fp->http_host); // HTTP specific
519 buf = calloc(0x100000, 1);
521 fp = knet_open("knetfile.c", "r");
522 knet_seek(fp, 1000, SEEK_SET);
523 } else if (type == 1) { // NCBI FTP, large file
524 fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r");
525 knet_seek(fp, 2500000000ll, SEEK_SET);
526 l = knet_read(fp, buf, 255);
527 } else if (type == 2) {
528 fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r");
529 knet_seek(fp, 1000, SEEK_SET);
530 } else if (type == 3) {
531 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r");
532 knet_seek(fp, 1000, SEEK_SET);
533 } else if (type == 4) {
534 fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r");
535 knet_read(fp, buf, 10000);
536 knet_seek(fp, 20000, SEEK_SET);
537 knet_seek(fp, 10000, SEEK_SET);
538 l = knet_read(fp, buf+10000, 10000000) + 10000;
540 if (type != 4 && type != 1) {
541 knet_read(fp, buf, 255);
544 } else write(fileno(stdout), buf, l);