From: Heng Li <lh3@live.co.uk>
Date: Fri, 12 Jun 2009 23:15:24 +0000 (+0000)
Subject:  * samtools-0.1.4-8 (r338)
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=78de989ec56e7c17e1fa854cb8c1c4f9ae08856c;p=samtools.git

 * samtools-0.1.4-8 (r338)
 * parse the @RG header lines and allow to choose library at the "samtools view"
   command line
---

diff --git a/bam.c b/bam.c
index 22e09cf..e3cb6bd 100644
--- a/bam.c
+++ b/bam.c
@@ -79,6 +79,7 @@ void bam_header_destroy(bam_header_t *header)
 	}
 	free(header->text);
 #ifndef BAM_NO_HASH
+	if (header->rg2lib) bam_strmap_destroy(header->rg2lib);
 	bam_destroy_header_hash(header);
 #endif
 	free(header);
diff --git a/bam.h b/bam.h
index 9330075..c635e72 100644
--- a/bam.h
+++ b/bam.h
@@ -100,7 +100,7 @@ typedef struct {
 	int32_t n_targets;
 	char **target_name;
 	uint32_t *target_len;
-	void *hash;
+	void *hash, *rg2lib;
 	int l_text;
 	char *text;
 } bam_header_t;
@@ -317,9 +317,16 @@ extern "C" {
 
 	bam_header_t *sam_header_read(tamFile fp);
 	int sam_header_parse(bam_header_t *h);
+	int sam_header_parse_rg(bam_header_t *h);
 
 #define sam_write1(header, b) bam_view1(header, b)
 
+	int bam_strmap_put(void *strmap, const char *rg, const char *lib);
+	const char *bam_strmap_get(const void *strmap, const char *rg);
+	void *bam_strmap_dup(const void*);
+	void *bam_strmap_init();
+	void bam_strmap_destroy(void *strmap);
+
 	/*!
 	  @abstract Initialize a header structure.
 	  @return   the pointer to the header structure
@@ -593,7 +600,7 @@ extern "C" {
 
 	void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data);
 	// uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2]); // an alias of bam_aux_get()
-	uint8_t *bam_aux_get(bam1_t *b, const char tag[2]);
+	uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
 	int32_t bam_aux2i(const uint8_t *s);
 	float bam_aux2f(const uint8_t *s);
 	double bam_aux2d(const uint8_t *s);
diff --git a/bam_aux.c b/bam_aux.c
index 41be819..137f9af 100644
--- a/bam_aux.c
+++ b/bam_aux.c
@@ -1,7 +1,9 @@
 #include <ctype.h>
 #include "bam.h"
 #include "khash.h"
+typedef char *str_p;
 KHASH_MAP_INIT_STR(s, int)
+KHASH_MAP_INIT_STR(r2l, str_p)
 
 void bam_aux_append(bam1_t *b, const char tag[2], char type, int len, uint8_t *data)
 {
@@ -23,7 +25,7 @@ uint8_t *bam_aux_get_core(bam1_t *b, const char tag[2])
 	return bam_aux_get(b, tag);
 }
 */
-uint8_t *bam_aux_get(bam1_t *b, const char tag[2])
+uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
 {
 	uint8_t *s;
 	int y = tag[0]<<8 | tag[1];
@@ -158,6 +160,70 @@ char *bam_aux2Z(const uint8_t *s)
 	else return 0;
 }
 
+/******************
+ * rg2lib related *
+ ******************/
+
+int bam_strmap_put(void *rg2lib, const char *rg, const char *lib)
+{
+	int ret;
+	khint_t k;
+	khash_t(r2l) *h = (khash_t(r2l)*)rg2lib;
+	char *key = strdup(rg);
+	k = kh_put(r2l, h, key, &ret);
+	if (ret) kh_val(h, k) = strdup(lib);
+	else {
+		fprintf(stderr, "[bam_rg2lib_put] duplicated @RG ID: %s\n", rg);
+		free(key);
+	}
+	return 0;
+}
+
+const char *bam_strmap_get(const void *rg2lib, const char *rg)
+{
+	const khash_t(r2l) *h = (const khash_t(r2l)*)rg2lib;
+	khint_t k;
+	k = kh_get(r2l, h, rg);
+	if (k != kh_end(h)) return (const char*)kh_val(h, k);
+	else return 0;
+}
+
+void *bam_strmap_dup(const void *rg2lib)
+{
+	const khash_t(r2l) *h = (const khash_t(r2l)*)rg2lib;
+	khash_t(r2l) *g = kh_init(r2l);
+	khint_t k, l;
+	int ret;
+	for (k = kh_begin(h); k < kh_end(h); ++k) {
+		if (kh_exist(h, k)) {
+			char *key = strdup(kh_key(h, k));
+			l = kh_put(r2l, g, key, &ret);
+			kh_val(g, l) = strdup(kh_val(h, k));
+		}
+	}
+	return g;
+}
+
+void *bam_strmap_init()
+{
+	return (void*)kh_init(r2l);
+}
+
+void bam_strmap_destroy(void *rg2lib)
+{
+	khash_t(r2l) *h = (khash_t(r2l)*)rg2lib;
+	khint_t k;
+	if (h == 0) return;
+	for (k = kh_begin(h); k < kh_end(h); ++k) {
+		if (kh_exist(h, k)) {
+			free((char*)kh_key(h, k)); free(kh_val(h, k));
+		}
+	}
+	kh_destroy(r2l, h);
+}
+
+/*** The following routines were implemented by Nils Homer for color-space support in tview ***/
+
 char bam_aux_getCSi(bam1_t *b, int i)
 {
 	uint8_t *c = bam_aux_get(b, "CS");
diff --git a/bam_import.c b/bam_import.c
index 77e3243..c085473 100644
--- a/bam_import.c
+++ b/bam_import.c
@@ -5,6 +5,7 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <assert.h>
+#include "kstring.h"
 #include "bam.h"
 #include "kseq.h"
 #include "khash.h"
@@ -146,6 +147,55 @@ static inline void append_text(bam_header_t *header, kstring_t *str)
 	header->text[header->l_text] = 0;
 }
 
+int sam_header_parse_rg(bam_header_t *h)
+{
+	kstring_t *rgid, *rglib;
+	char *p, *q, *s, *r;
+	int n = 0;
+
+	bam_strmap_destroy(h->rg2lib);
+	// parse @RG lines
+	h->rg2lib = bam_strmap_init();
+	rgid = calloc(1, sizeof(kstring_t));
+	rglib = calloc(1, sizeof(kstring_t));
+	s = h->text;
+	while ((s = strstr(s, "@RG")) != 0) {
+		if (rgid->l && rglib->l) {
+			bam_strmap_put(h->rg2lib, rgid->s, rglib->s);
+			++n;
+		}
+		rgid->l = rglib->l = 0;
+		s += 3;
+		r = s;
+		if ((p = strstr(s, "ID:")) != 0) {
+			q = p + 3;
+			for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
+			kputsn(q, p - q, rgid);
+		} else {
+			fprintf(stderr, "[bam_header_parse] missing ID tag in @RG lines.\n");
+			break;
+		}
+		if (r < p) r = p;
+		if ((p = strstr(s, "LB:")) != 0) {
+			q = p + 3;
+			for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
+			kputsn(q, p - q, rglib);
+		} else {
+			fprintf(stderr, "[bam_header_parse] missing LB tag in @RG lines.\n");
+			break;
+		}
+		if (r < p) r = p;
+		s = r + 3;
+	}
+	if (rgid->l && rglib->l) {
+		bam_strmap_put(h->rg2lib, rgid->s, rglib->s);
+		++n;
+	}
+	free(rgid->s); free(rgid);
+	free(rglib->s); free(rglib);
+	return n;
+}
+
 int sam_header_parse(bam_header_t *h)
 {
 	int i;
@@ -183,6 +233,7 @@ int sam_header_parse(bam_header_t *h)
 		s = r + 3;
 		++i;
 	}
+	sam_header_parse_rg(h);
 	return h->n_targets;
 
 header_err_ret:
diff --git a/bamtk.c b/bamtk.c
index 878562c..852213e 100644
--- a/bamtk.c
+++ b/bamtk.c
@@ -3,7 +3,7 @@
 #include "bam.h"
 
 #ifndef PACKAGE_VERSION
-#define PACKAGE_VERSION "0.1.4-7 (r337)"
+#define PACKAGE_VERSION "0.1.4-8 (r338)"
 #endif
 
 int bam_taf2baf(int argc, char *argv[]);
diff --git a/kstring.h b/kstring.h
index 47d63f7..221ade2 100644
--- a/kstring.h
+++ b/kstring.h
@@ -19,19 +19,24 @@ typedef struct __kstring_t {
 int ksprintf(kstring_t *s, const char *fmt, ...);
 int ksplit_core(char *s, int delimiter, int *_max, int **_offsets);
 
-static inline int kputs(const char *p, kstring_t *s)
+static inline int kputsn(const char *p, int l, kstring_t *s)
 {
-	int l = strlen(p);
 	if (s->l + l + 1 >= s->m) {
 		s->m = s->l + l + 2;
 		kroundup32(s->m);
 		s->s = (char*)realloc(s->s, s->m);
 	}
-	strcpy(s->s + s->l, p);
+	strncpy(s->s + s->l, p, l);
 	s->l += l;
+	s->s[s->l] = 0;
 	return l;
 }
 
+static inline int kputs(const char *p, kstring_t *s)
+{
+	return kputsn(p, strlen(p), s);
+}
+
 static inline int kputc(int c, kstring_t *s)
 {
 	if (s->l + 1 >= s->m) {
diff --git a/sam.c b/sam.c
index 04be5c5..4c16b02 100644
--- a/sam.c
+++ b/sam.c
@@ -19,6 +19,7 @@ bam_header_t *bam_header_dup(const bam_header_t *h0)
 		h->target_len[i] = h0->target_len[i];
 		h->target_name[i] = strdup(h0->target_name[i]);
 	}
+	if (h0->rg2lib) h->rg2lib = bam_strmap_dup(h0->rg2lib);
 	return h;
 }
 
@@ -46,6 +47,7 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux)
 					fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
 			} else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
 		}
+		sam_header_parse_rg(fp->header);
 	} else if (mode[0] == 'w') { // write
 		fp->header = bam_header_dup((const bam_header_t*)aux);
 		if (mode[1] == 'b') { // binary
diff --git a/sam_view.c b/sam_view.c
index 4858610..50192f1 100644
--- a/sam_view.c
+++ b/sam_view.c
@@ -5,14 +5,26 @@
 #include "sam.h"
 
 static int g_min_mapQ = 0, g_flag_on = 0, g_flag_off = 0;
+static char *g_library;
 
-#define __g_skip_aln(b) (((b)->core.qual < g_min_mapQ) || ((b->core.flag & g_flag_on) != g_flag_on) \
-						 || (b->core.flag & g_flag_off))
+static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b)
+{
+	if (b->core.qual < g_min_mapQ || ((b->core.flag & g_flag_on) != g_flag_on) || (b->core.flag & g_flag_off))
+		return 1;
+	if (g_library) {
+		uint8_t *s = bam_aux_get(b, "RG");
+		if (s) {
+			const char *p = bam_strmap_get(h->rg2lib, s + 1);
+			return (p && strcmp(p, g_library) == 0)? 0 : 1;
+		} else return 1;
+	} else return 0;
+}
 
 // callback function for bam_fetch()
 static int view_func(const bam1_t *b, void *data)
 {
-	if (!__g_skip_aln(b)) samwrite((samfile_t*)data, b);
+	if (!__g_skip_aln(((samfile_t*)data)->header, b))
+		samwrite((samfile_t*)data, b);
 	return 0;
 }
 
@@ -26,7 +38,7 @@ int main_samview(int argc, char *argv[])
 
 	/* parse command-line options */
 	strcpy(in_mode, "r"); strcpy(out_mode, "w");
-	while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:u")) >= 0) {
+	while ((c = getopt(argc, argv, "Sbt:hHo:q:f:F:ul:")) >= 0) {
 		switch (c) {
 		case 'S': is_bamin = 0; break;
 		case 'b': is_bamout = 1; break;
@@ -38,6 +50,7 @@ int main_samview(int argc, char *argv[])
 		case 'F': g_flag_off = strtol(optarg, 0, 0); break;
 		case 'q': g_min_mapQ = atoi(optarg); break;
 		case 'u': is_uncompressed = 1; break;
+		case 'l': g_library = strdup(optarg); break;
 		default: return usage();
 		}
 	}
@@ -64,7 +77,7 @@ int main_samview(int argc, char *argv[])
 		bam1_t *b = bam_init1();
 		int r;
 		while ((r = samread(in, b)) >= 0) // read one alignment from `in'
-			if (!__g_skip_aln(b))
+			if (!__g_skip_aln(in->header, b))
 				samwrite(out, b); // write the alignment to `out'
 		if (r < -1) fprintf(stderr, "[main_samview] truncated file.\n");
 		bam_destroy1(b);
@@ -91,7 +104,7 @@ int main_samview(int argc, char *argv[])
 
 view_end:
 	// close files, free and return
-	free(fn_list); free(fn_out);
+	free(fn_list); free(fn_out); free(g_library);
 	samclose(in);
 	samclose(out);
 	return ret;