From 0f0dd167be99f185b327beee5c49363ae2e1c8b2 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Fri, 5 Dec 2008 03:03:11 +0000 Subject: [PATCH] added bed2tag_contigs.c git-svn-id: http://biopieces.googlecode.com/svn/trunk@331 74ccb610-7750-0410-82ae-013aeee3265d --- code_c/Maasha/src/Makefile | 6 +- code_c/Maasha/src/bed2tag_contigs.c | 134 +++++++++++++++++++++++++++ code_c/Maasha/src/inc/barray.h | 3 + code_c/Maasha/src/lib/barray.c | 22 ++++- code_c/Maasha/src/test/test_barray.c | 40 +++++++- 5 files changed, 201 insertions(+), 4 deletions(-) create mode 100644 code_c/Maasha/src/bed2tag_contigs.c diff --git a/code_c/Maasha/src/Makefile b/code_c/Maasha/src/Makefile index 884817b..d1e8d84 100644 --- a/code_c/Maasha/src/Makefile +++ b/code_c/Maasha/src/Makefile @@ -11,7 +11,7 @@ TEST_DIR = test/ INC = -I $(INC_DIR) LIB = -lm $(LIB_DIR)*.o -all: libs utest bed2fixedstep bed_sort bipartite_scan bipartite_decode fasta_count repeat-O-matic +all: libs utest bed2fixedstep bed2tag_contigs bed_sort bipartite_scan bipartite_decode fasta_count repeat-O-matic libs: cd $(LIB_DIR) && ${MAKE} all @@ -22,6 +22,9 @@ utest: bed2fixedstep: bed2fixedstep.c $(CC) $(Cflags) $(INC) $(LIB) bed2fixedstep.c -o bed2fixedstep +bed2tag_contigs: bed2tag_contigs.c + $(CC) $(Cflags) $(INC) $(LIB) bed2tag_contigs.c -o bed2tag_contigs + bed_sort: bed_sort.c $(CC) $(Cflags) $(INC) $(LIB) bed_sort.c -o bed_sort @@ -41,6 +44,7 @@ clean: cd $(LIB_DIR) && ${MAKE} clean cd $(TEST_DIR) && ${MAKE} clean rm bed2fixedstep + rm bed2tag_contigs rm bed_sort rm bipartite_scan rm bipartite_decode diff --git a/code_c/Maasha/src/bed2tag_contigs.c b/code_c/Maasha/src/bed2tag_contigs.c new file mode 100644 index 0000000..ae61b35 --- /dev/null +++ b/code_c/Maasha/src/bed2tag_contigs.c @@ -0,0 +1,134 @@ +#include "common.h" +#include "mem.h" +#include "list.h" +#include "ucsc.h" +#include "hash.h" +#include "barray.h" + +#define BED_COLS 5 +#define HASH_SIZE 8 +#define BARRAY_SIZE ( 1 << 16 ) + + +static void usage() +{ + fprintf( stderr, + "\n" + "bed2tag_contigs assembles overlapping BED type entries into tag\n" + "contigs if the clone count (assumed to be appended to the Q_ID field) of each\n" + "entry is higher than the times the entry was mapped to the genome (this\n" + "information is assumed to be the SCORE field). A tag contig score is calculated\n" + "as the highest density of overlapping tags (assuming that these tags are\n" + "uniquely mapping):\n" + "\n" + " tags ----------- Q_ID: ID_2\n" + " ------------- Q_ID: ID_3\n" + " --------------- Q_ID: ID_1\n" + " ------------ Q_ID: ID_2\n" + " tag contig =============================\n" + " scores 22555566666444411333322222222\n" + "\n" + " tag contig score = 6\n" + "\n" + "Thus, tags are only assembled into tag contigs if their expression level is\n" + "higher than their repetitiveness; a tag sequenced 10 times, i.e. with a clone\n" + "count of 10, will only be considered if was found in the genome fewer than 10\n" + "times, i.e. have a SCORE below 10.\n" + "\n" + "A forthrunning number prefixed with TC is added as Q_ID for the resulting tag\n" + "contigs.\n" + "\n" + "Usage: bed2tag_contigs < > \n\n" + ); + + exit( EXIT_FAILURE ); +} + + +long get_score( char *str ) +{ + /* Martin A. Hansen, December 2008. */ + + /* Extract the last decimal number after _ + * in a string and return that. If no number + * was found return 1. */ + + char *c; + long score = 1; + + if ( ( c = strrchr( str, '_' ) ) != NULL ) { + score = strtol( &c[ 1 ], NULL , 10 ); + } + + return score; +} + + +int main( int argc, char *argv[] ) +{ + bed_entry *entry = NULL; + hash *chr_hash = NULL; + hash_elem *bucket = NULL; + barray *ba = NULL; + ushort score = 0; + size_t i = 0; + char *chr = NULL; + size_t beg = 0; + size_t end = 0; + size_t pos = 0; + ushort max = 0; + size_t id = 0; + + entry = bed_entry_new( BED_COLS ); + chr_hash = hash_new( HASH_SIZE ); + + if ( isatty( fileno( stdin ) ) ) { + usage(); + } + + while ( ( bed_entry_get( stdin, &entry ) ) ) + { +// bed_entry_put( entry, entry->cols ); + + ba = ( barray * ) hash_get( chr_hash, entry->chr ); + + if ( ba == NULL ) + { + ba = barray_new( BARRAY_SIZE ); + + hash_add( chr_hash, entry->chr, ba ); + } + + score = ( ushort ) get_score( entry->q_id ); + + if ( score > entry->score ) { + barray_interval_inc( ba, entry->chr_beg, entry->chr_end - 1, ( short ) score / entry->score ); + } + } + +// barray_print( ba ); + + for ( i = 0; i < chr_hash->table_size; i++ ) + { + for ( bucket = chr_hash->table[ i ]; bucket != NULL; bucket = bucket->next ) + { + chr = bucket->key; + ba = ( barray * ) bucket->val; + + pos = 0; + + while ( barray_interval_scan( ba, &pos, &beg, &end ) ) + { + max = barray_interval_max( ba, beg, end ); + +// printf( "chr: %s pos: %zu beg: %zu end: %zu max: %hd\n", chr, pos, beg, end, max ); + + printf( "%s\t%zu\t%zu\tID_%08zu\t%hd\n", chr, beg, end + 1, id, max ); + + id++; + } + } + } + + return EXIT_SUCCESS; +} diff --git a/code_c/Maasha/src/inc/barray.h b/code_c/Maasha/src/inc/barray.h index 9d1a0b4..42e7ee6 100644 --- a/code_c/Maasha/src/inc/barray.h +++ b/code_c/Maasha/src/inc/barray.h @@ -38,6 +38,9 @@ void barray_interval_inc( barray *ba, size_t beg, size_t end, ushort score ); * for the next interval of non-zero values. */ bool barray_interval_scan( barray *ba, size_t *pos_pt, size_t *beg_pt, size_t *end_pt ); +/* Locate the max value in an interval within a byte array. */ +ushort barray_interval_max( barray *ba, size_t beg, size_t end ); + /* Deallocates a byte array and set it to NULL. */ void barray_destroy( barray **ba_ppt ); diff --git a/code_c/Maasha/src/lib/barray.c b/code_c/Maasha/src/lib/barray.c index 08fdc02..474e56d 100644 --- a/code_c/Maasha/src/lib/barray.c +++ b/code_c/Maasha/src/lib/barray.c @@ -64,7 +64,7 @@ void barray_print( barray *ba ) int i; - printf( "\nnmemb: %zu end: %zu\n", ba->nmemb, ba->end ); + printf( "\nba->nmemb: %zu ba->end: %zu\n", ba->nmemb, ba->end ); for ( i = 0; i < ba->nmemb; i++ ) { printf( "%d: %d\n", i, ba->array[ i ] ); @@ -82,6 +82,7 @@ void barray_interval_inc( barray *ba, size_t beg, size_t end, ushort score ) assert( beg >= 0 ); assert( end >= 0 ); assert( end >= beg ); + assert( score > 0 ); if ( end > ba->nmemb ) { barray_resize( ba, barray_new_size( end ) ); @@ -108,7 +109,7 @@ bool barray_interval_scan( barray *ba, size_t *pos_pt, size_t *beg_pt, size_t *e size_t beg = *beg_pt; size_t end = *end_pt; - if ( pos > ba->end ) { + if ( pos > ba->end || ba->end == 0 ) { return FALSE; } @@ -138,6 +139,23 @@ bool barray_interval_scan( barray *ba, size_t *pos_pt, size_t *beg_pt, size_t *e } +ushort barray_interval_max( barray *ba, size_t beg, size_t end ) +{ + /* Martin A. Hansen, December 2008. */ + + /* Locate the max value in an interval within a byte array. */ + + size_t i = 0; + ushort max = 0; + + for ( i = beg; i <= end; i++ ) { + max = MAX( max, ba->array[ i ] ); + } + + return max; +} + + void barray_destroy( barray **ba_ppt ) { /* Martin A. Hansen, November 2008. */ diff --git a/code_c/Maasha/src/test/test_barray.c b/code_c/Maasha/src/test/test_barray.c index 10aa9fe..c7d2608 100644 --- a/code_c/Maasha/src/test/test_barray.c +++ b/code_c/Maasha/src/test/test_barray.c @@ -9,6 +9,7 @@ static void test_barray_resize(); static void test_barray_print(); static void test_barray_interval_inc(); static void test_barray_interval_scan(); +static void test_barray_interval_max(); static void test_barray_destroy(); @@ -22,6 +23,7 @@ int main() test_barray_print(); test_barray_interval_inc(); test_barray_interval_scan(); + test_barray_interval_max(); test_barray_destroy(); fprintf( stderr, "Done\n\n" ); @@ -123,10 +125,10 @@ void test_barray_interval_scan() ba = barray_new( nmemb ); +/* barray_interval_inc( ba, 1, 2, 1 ); barray_interval_inc( ba, 4, 5, 1 ); -/* barray_interval_inc( ba, 0, 0, 3 ); barray_interval_inc( ba, 0, 3, 3 ); barray_interval_inc( ba, 9, 9, 3 ); @@ -135,10 +137,46 @@ void test_barray_interval_scan() barray_interval_inc( ba, 25, 35, 2 ); */ +// barray_print( ba ); + while ( barray_interval_scan( ba, &pos, &beg, &end ) ) { // printf( "beg: %zu end: %zu\n", beg, end ); } + fprintf( stderr, "OK\n" ); +} + + +void test_barray_interval_max() +{ + fprintf( stderr, " Testing barray_interval_max ... " ); + + size_t nmemb = 100; + size_t pos = 0; + size_t beg = 0; + size_t end = 0; + ushort max = 0; + barray *ba = NULL; + + ba = barray_new( nmemb ); + + barray_interval_inc( ba, 1, 2, 1 ); + barray_interval_inc( ba, 4, 5, 1 ); + barray_interval_inc( ba, 0, 0, 3 ); + barray_interval_inc( ba, 0, 3, 3 ); + barray_interval_inc( ba, 9, 9, 3 ); + barray_interval_inc( ba, 11, 11, 3 ); + barray_interval_inc( ba, 19, 29, 3 ); + barray_interval_inc( ba, 25, 35, 2 ); + barray_interval_inc( ba, 35, 35, 10 ); + + while ( barray_interval_scan( ba, &pos, &beg, &end ) ) + { + max = barray_interval_max( ba, beg, end ); + +// printf( "beg: %zu end: %zu max: %hd\n", beg, end, max ); + } + // barray_print( ba ); fprintf( stderr, "OK\n" ); -- 2.39.2