From: martinahansen Date: Thu, 4 Dec 2008 04:05:27 +0000 (+0000) Subject: added bed2fixedstep.c X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=1bd2b17face6ba73c6b8054cdb26c761198b4fb4;p=biopieces.git added bed2fixedstep.c git-svn-id: http://biopieces.googlecode.com/svn/trunk@327 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_c/Maasha/src/Makefile b/code_c/Maasha/src/Makefile index 983bab7..884817b 100644 --- a/code_c/Maasha/src/Makefile +++ b/code_c/Maasha/src/Makefile @@ -11,7 +11,7 @@ TEST_DIR = test/ INC = -I $(INC_DIR) LIB = -lm $(LIB_DIR)*.o -all: libs utest bed_sort bipartite_scan bipartite_decode fasta_count repeat-O-matic +all: libs utest bed2fixedstep bed_sort bipartite_scan bipartite_decode fasta_count repeat-O-matic libs: cd $(LIB_DIR) && ${MAKE} all @@ -19,6 +19,9 @@ libs: utest: cd $(TEST_DIR) && ${MAKE} all +bed2fixedstep: bed2fixedstep.c + $(CC) $(Cflags) $(INC) $(LIB) bed2fixedstep.c -o bed2fixedstep + bed_sort: bed_sort.c $(CC) $(Cflags) $(INC) $(LIB) bed_sort.c -o bed_sort @@ -37,6 +40,7 @@ repeat-O-matic: repeat-O-matic.c clean: cd $(LIB_DIR) && ${MAKE} clean cd $(TEST_DIR) && ${MAKE} clean + rm bed2fixedstep rm bed_sort rm bipartite_scan rm bipartite_decode diff --git a/code_c/Maasha/src/bed2fixedstep.c b/code_c/Maasha/src/bed2fixedstep.c new file mode 100644 index 0000000..a060437 --- /dev/null +++ b/code_c/Maasha/src/bed2fixedstep.c @@ -0,0 +1,100 @@ +#include "common.h" +#include "mem.h" +#include "filesys.h" +#include "list.h" +#include "ucsc.h" +#include "hash.h" +#include "barray.h" + +#define BED_COLS 5 +#define HASH_SIZE 8 +#define BARRAY_SIZE ( 1 << 16 ) + + +long get_score( char *str ) +{ + /* Martin A. Hansen, December 2008. */ + + /* Extract the last decimal number after _ + * in a string and return that. If no number + * was found return 1. */ + + char *c; + long score = 1; + + if ( ( c = strrchr( str, '_' ) ) != NULL ) { + score = strtol( &c[ 1 ], NULL , 10 ); + } + + return score; +} + + +int main( int argc, char *argv[] ) +{ + char *file = NULL; + FILE *fp = NULL; + bed_entry *entry = NULL; + hash *chr_hash = NULL; + hash_elem *bucket = NULL; + barray *ba = NULL; + ushort score = 0; + size_t i = 0; + size_t j = 0; + char *chr = NULL; + size_t beg = 0; + size_t end = 0; + size_t pos = 0; + + entry = bed_entry_new( BED_COLS ); + chr_hash = hash_new( HASH_SIZE ); + + file = argv[ argc - 1 ]; + fp = read_open( file ); + + while ( ( bed_entry_get( fp, &entry ) ) ) + { +// bed_entry_put( entry, entry->cols ); + + ba = ( barray * ) hash_get( chr_hash, entry->chr ); + + if ( ba == NULL ) + { + ba = barray_new( BARRAY_SIZE ); + + hash_add( chr_hash, entry->chr, ba ); + } + + score = ( ushort ) get_score( entry->q_id ); + + barray_interval_inc( ba, entry->chr_beg, entry->chr_end - 1, score ); + } + + close_stream( fp ); + +// barray_print( ba ); + + for ( i = 0; i < chr_hash->table_size; i++ ) + { + for ( bucket = chr_hash->table[ i ]; bucket != NULL; bucket = bucket->next ) + { + chr = bucket->key; + ba = ( barray * ) bucket->val; + + pos = 0; + + while ( barray_interval_scan( ba, &pos, &beg, &end ) ) + { +// printf( "chr: %s pos: %zu beg: %zu end: %zu\n", chr, pos, beg, end ); + + printf( "fixedStep chrom=%s start=%zu step=1\n", chr, beg ); + + for ( j = beg; j <= end; j++ ) { + printf( "%hd\n", ba->array[ j ] ); + } + } + } + } + + return EXIT_SUCCESS; +} diff --git a/code_c/Maasha/src/inc/hash.h b/code_c/Maasha/src/inc/hash.h index 50ef219..28a9197 100644 --- a/code_c/Maasha/src/inc/hash.h +++ b/code_c/Maasha/src/inc/hash.h @@ -13,10 +13,12 @@ typedef struct _hash_elem hash_elem; /* Structure of a generic hash. */ struct _hash { - hash_elem **table; /* Hash table. */ - size_t mask; /* Mask to trim hashed keys. */ - size_t table_size; /* Size of hash table. */ - size_t nmemb; /* Number of elements in hash table. */ + hash_elem **table; /* Hash table. */ + size_t mask; /* Mask to trim hashed keys. */ + size_t table_size; /* Size of hash table. */ + size_t nmemb; /* Number of elements in hash table. */ + size_t index_table; /* Index for iterating hash table. */ + hash_elem *index_bucket; /* Index for iterating buckets. */ }; typedef struct _hash hash; @@ -39,6 +41,9 @@ void *hash_get( hash *hash_pt, char *key ); /* Lookup a key in a given hash and return the hash element - or NULL if not found. */ hash_elem *hash_elem_get( hash *hash_pt, char *key ); +/* Get the next key/value pair from a hash table. */ +bool hash_each( hash *hash_pt, char **key_ppt, void *val ); + /* Deallocate memory for hash and all hash elements. */ void hash_destroy( hash *hash_pt ); diff --git a/code_c/Maasha/src/inc/ucsc.h b/code_c/Maasha/src/inc/ucsc.h index 291c05b..b932811 100644 --- a/code_c/Maasha/src/inc/ucsc.h +++ b/code_c/Maasha/src/inc/ucsc.h @@ -33,55 +33,55 @@ typedef struct _bed_entry bed_entry; bed_entry *bed_entry_new( const int cols ); /* Free memory for a BED entry. */ -void bed_entry_destroy( bed_entry *entry ); +void bed_entry_destroy( bed_entry *entry ); -/* Get next BED entry of a given number of columns from a file pointer. */ -bed_entry *bed_entry_get( FILE *fp, const int cols ); +/* Get next BED entry from a file stream. */ +bool bed_entry_get( FILE *fp, bed_entry **entry_ppt ); /* Get a singly linked list with all BED entries (of a given number of coluns */ /* from a specified file. */ -list_sl *bed_entries_get( char *path, const int cols ); +list_sl *bed_entries_get( char *path, const int cols ); /* Output a given number of columns from a BED entry to stdout. */ -void bed_entry_put( bed_entry *entry, int cols ); +void bed_entry_put( bed_entry *entry, int cols ); /* Output a given number of columns from all BED entries */ /* in a singly linked list. */ -void bed_entries_put( list_sl *entries, int cols ); +void bed_entries_put( list_sl *entries, int cols ); /* Free memory for all BED entries and list nodes. */ -void bed_entries_destroy( list_sl **entries_ppt ); +void bed_entries_destroy( list_sl **entries_ppt ); /* Given a path to a BED file, read the given number of cols */ /* according to the begin position. The result is written to stdout. */ -void bed_file_sort_beg( char *path, int cols ); +void bed_file_sort_beg( char *path, int cols ); /* Given a path to a BED file, read the given number of cols */ /* according to the strand AND begin position. The result is written to stdout. */ -void bed_file_sort_strand_beg( char *path, int cols ); +void bed_file_sort_strand_beg( char *path, int cols ); /* Given a path to a BED file, read the given number of cols */ /* according to the chromosome AND begin position. The result is written to stdout. */ -void bed_file_sort_chr_beg( char *path, int cols ); +void bed_file_sort_chr_beg( char *path, int cols ); /* Given a path to a BED file, read the given number of cols */ /* according to the chromosome AND strand AND begin position. The result is written to stdout. */ -void bed_file_sort_chr_strand_beg( char *path, int cols ); +void bed_file_sort_chr_strand_beg( char *path, int cols ); /* Compare function for sorting a singly linked list of BED entries */ /* according to begin position. */ -int cmp_bed_sort_beg( const void *a, const void *b ); +int cmp_bed_sort_beg( const void *a, const void *b ); /* Compare function for sorting a singly linked list of BED entries */ /* according to strand AND begin position. */ -int cmp_bed_sort_strand_beg( const void *a, const void *b ); +int cmp_bed_sort_strand_beg( const void *a, const void *b ); /* Compare function for sorting a singly linked list of BED entries */ /* according to chromosome name AND begin position. */ -int cmp_bed_sort_chr_beg( const void *a, const void *b ); +int cmp_bed_sort_chr_beg( const void *a, const void *b ); /* Compare function for sorting a singly linked list of BED entries */ /* according to chromosome name AND strand AND begin position. */ -int cmp_bed_sort_chr_strand_beg( const void *a, const void *b ); +int cmp_bed_sort_chr_strand_beg( const void *a, const void *b ); diff --git a/code_c/Maasha/src/lib/barray.c b/code_c/Maasha/src/lib/barray.c index fdb8620..68ee72e 100644 --- a/code_c/Maasha/src/lib/barray.c +++ b/code_c/Maasha/src/lib/barray.c @@ -115,17 +115,18 @@ bool barray_interval_scan( barray *ba, size_t *pos_pt, size_t *beg_pt, size_t *e beg = pos; - while ( pos < ba->end && ba->array[ pos ] != 0 ) { + while ( pos <= ba->end && ba->array[ pos ] != 0 ) { pos++; } - end = pos - 1; + end = pos; + if ( end >= beg ) { *pos_pt = pos; *beg_pt = beg; - *end_pt = end; + *end_pt = end - 1; return TRUE; } diff --git a/code_c/Maasha/src/lib/hash.c b/code_c/Maasha/src/lib/hash.c index 5d67fb0..b9ed9e3 100644 --- a/code_c/Maasha/src/lib/hash.c +++ b/code_c/Maasha/src/lib/hash.c @@ -19,10 +19,13 @@ hash *hash_new( size_t size ) table_size = 1 << size; /* table_size = ( 2 ** size ) */ - new_hash->table_size = table_size; - new_hash->mask = table_size - 1; - new_hash->table = mem_get( sizeof( hash_elem * ) * table_size ); - new_hash->nmemb = 0; + new_hash->table_size = table_size; + new_hash->mask = table_size - 1; + new_hash->table = mem_get( sizeof( hash_elem * ) * table_size ); + new_hash->nmemb = 0; + new_hash->index_table = 0; + new_hash->index_bucket = mem_get( sizeof( hash_elem ) ); + new_hash->index_bucket = NULL; return new_hash; } @@ -122,6 +125,57 @@ hash_elem *hash_elem_get( hash *hash_pt, char *key ) } +bool hash_each( hash *hash_pt, char **key_ppt, void *val ) +{ + /* Martin A. Hansen, December 2008. */ + + /* Get the next key/value pair from a hash table. */ + + char *key = *key_ppt; + + printf( "\nhash_each INIT -> i: %zu he: %p\n", hash_pt->index_table, hash_pt->index_bucket ); + + if ( hash_pt->index_bucket != NULL ) + { + key = hash_pt->index_bucket->key; + val = hash_pt->index_bucket->val; + + hash_pt->index_bucket = hash_pt->index_bucket->next; + + *key_ppt = key; + + printf( "\nhash_each BUCKET -> i: %zu he: %p\n", hash_pt->index_table, hash_pt->index_bucket ); + return TRUE; + } + + while ( hash_pt->index_table < hash_pt->table_size ) + { + hash_pt->index_bucket = hash_pt->table[ hash_pt->index_table ]; + + if ( hash_pt->index_bucket != NULL ) + { + key = hash_pt->index_bucket->key; + val = hash_pt->index_bucket->val; + + hash_pt->index_bucket = hash_pt->index_bucket->next; + + *key_ppt = key; + + printf( "hash_each TABLE table[ %zu ]\n", hash_pt->index_table ); + return TRUE; + } + + hash_pt->index_table++; + } + + printf( "\nhash_each FALSE -> i: %zu he: %p\n", hash_pt->index_table, hash_pt->index_bucket ); + + // RESET ITERATORS! + + return FALSE; +} + + void hash_destroy( hash *hash_pt ) { /* Martin A. Hansen, June 2008 */ diff --git a/code_c/Maasha/src/lib/ucsc.c b/code_c/Maasha/src/lib/ucsc.c index 89d8083..8b32428 100644 --- a/code_c/Maasha/src/lib/ucsc.c +++ b/code_c/Maasha/src/lib/ucsc.c @@ -19,10 +19,12 @@ bed_entry *bed_entry_new( const int cols ) bed_entry *entry = mem_get( sizeof( bed_entry ) ); entry->cols = cols; - entry->chr = mem_get( BED_CHR_MAX ); + entry->chr = NULL; entry->chr_beg = 0; entry->chr_end = 0; + entry->chr = mem_get( BED_CHR_MAX ); + if ( cols == 3 ) { return entry; } @@ -62,9 +64,7 @@ void bed_entry_destroy( bed_entry *entry ) /* Free memory for a BED entry. */ - int cols = entry->cols; - - if ( cols > 6 ) + if ( entry->cols > 6 ) { free( entry->itemrgb ); free( entry->blocksizes ); @@ -72,7 +72,7 @@ void bed_entry_destroy( bed_entry *entry ) free( entry->q_id ); free( entry->chr ); } - else if ( cols > 3 ) + else if ( entry->cols > 3 ) { free( entry->q_id ); free( entry->chr ); @@ -86,26 +86,18 @@ void bed_entry_destroy( bed_entry *entry ) } -bed_entry *bed_entry_get( FILE *fp, int cols ) +bool bed_entry_get( FILE *fp, bed_entry **entry_ppt ) { /* Martin A. Hansen, September 2008 */ - /* Get next BED entry of a given number of columns from a file pointer. */ + /* Get next BED entry from a file stream. */ - bed_entry *entry = bed_entry_new( cols ); + bed_entry *entry = *entry_ppt; char buffer[ BED_BUFFER ]; - assert( cols == 0 || cols == 3 || cols == 4 || cols == 5 || cols == 6 || cols == 12 ); - if ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) { - if ( ! cols ) - { - cols = 1 + strchr_total( buffer, '\t' ); - entry->cols = cols; - } - - if ( cols == 3 ) + if ( entry->cols == 3 ) { sscanf( buffer, @@ -115,10 +107,10 @@ bed_entry *bed_entry_get( FILE *fp, int cols ) &entry->chr_end ); - return entry; + return TRUE; } - if ( cols == 4 ) + if ( entry->cols == 4 ) { sscanf( buffer, @@ -129,10 +121,10 @@ bed_entry *bed_entry_get( FILE *fp, int cols ) entry->q_id ); - return entry; + return TRUE; } - if ( cols == 5 ) + if ( entry->cols == 5 ) { sscanf( buffer, @@ -144,10 +136,10 @@ bed_entry *bed_entry_get( FILE *fp, int cols ) &entry->score ); - return entry; + return TRUE; } - if ( cols == 6 ) + if ( entry->cols == 6 ) { sscanf( buffer, @@ -160,10 +152,10 @@ bed_entry *bed_entry_get( FILE *fp, int cols ) &entry->strand ); - return entry; + return TRUE; } - if ( cols == 12 ) + if ( entry->cols == 12 ) { sscanf( buffer, @@ -182,11 +174,11 @@ bed_entry *bed_entry_get( FILE *fp, int cols ) entry->q_begs ); - return entry; + return TRUE; } } - return NULL; + return FALSE; } @@ -203,22 +195,24 @@ list_sl *bed_entries_get( char *path, const int cols ) bed_entry *entry = NULL; FILE *fp = NULL; + entry = bed_entry_new( cols ); + fp = read_open( path ); - if ( ( entry = bed_entry_get( fp, cols ) ) != NULL ) + if ( ( bed_entry_get( fp, &entry ) ) ) { - node->val = entry; + node->val = mem_clone( entry, sizeof( bed_entry ) ); list_sl_add_beg( &list, &node ); old_node = node; } - while ( ( entry = bed_entry_get( fp, cols ) ) != NULL ) + while ( ( bed_entry_get( fp, &entry ) ) ) { node = node_sl_new(); - node->val = entry; + node->val = mem_clone( entry, sizeof( bed_entry ) ); list_sl_add_after( &old_node, &node ); diff --git a/code_c/Maasha/src/test/test_barray.c b/code_c/Maasha/src/test/test_barray.c index b6e100d..10aa9fe 100644 --- a/code_c/Maasha/src/test/test_barray.c +++ b/code_c/Maasha/src/test/test_barray.c @@ -123,15 +123,20 @@ void test_barray_interval_scan() ba = barray_new( nmemb ); + barray_interval_inc( ba, 1, 2, 1 ); + barray_interval_inc( ba, 4, 5, 1 ); + +/* barray_interval_inc( ba, 0, 0, 3 ); barray_interval_inc( ba, 0, 3, 3 ); barray_interval_inc( ba, 9, 9, 3 ); - barray_interval_inc( ba, 99, 100, 111 ); + barray_interval_inc( ba, 11, 11, 3 ); barray_interval_inc( ba, 19, 29, 3 ); barray_interval_inc( ba, 25, 35, 2 ); +*/ while ( barray_interval_scan( ba, &pos, &beg, &end ) ) { -// printf( "pos: %zu beg: %zu end: %zu\n", pos, beg, end ); +// printf( "beg: %zu end: %zu\n", beg, end ); } // barray_print( ba ); diff --git a/code_c/Maasha/src/test/test_hash.c b/code_c/Maasha/src/test/test_hash.c index 25d9d22..563b05f 100644 --- a/code_c/Maasha/src/test/test_hash.c +++ b/code_c/Maasha/src/test/test_hash.c @@ -9,6 +9,7 @@ static void test_hash_key(); static void test_hash_add(); static void test_hash_get(); static void test_hash_elem_get(); +static void test_hash_each(); static void test_hash_destroy(); static void test_hash_print(); static void test_hash_collision_stats(); @@ -22,6 +23,7 @@ int main() test_hash_add(); test_hash_get(); test_hash_elem_get(); + test_hash_each(); test_hash_destroy(); test_hash_print(); test_hash_collision_stats(); @@ -135,6 +137,46 @@ void test_hash_elem_get() } +void test_hash_each() +{ + fprintf( stderr, " Testing hash_each ... " ); + + hash *hash_pt = NULL; + size_t size = 8; + size_t i = 0; + char *key = NULL; + char *val = "val"; + char *key0 = NULL; + char *val0 = NULL; + + key = mem_get_zero( 50 ); + + hash_pt = hash_new( size ); + + for ( i = 0; i < ( 1 << size ); i++ ) + { + sprintf( key, "key_%zu", i ); + + hash_add( hash_pt, key, val ); + } + + assert( hash_pt->index_table == 0 ); + assert( hash_pt->index_bucket == NULL ); + + hash_print( hash_pt ); + + while( hash_each( hash_pt, &key0, &val0 ) ) + { + printf( "1: key0: %s val0: %s\n", key0, ( char * ) val0 ); + printf( "index_table: %zu index_bucket: %p\n", hash_pt->index_table, hash_pt->index_bucket ); + + hash_each( hash_pt, &key0, &val0 ); + } + + fprintf( stderr, "OK\n" ); +} + + void test_hash_destroy() { fprintf( stderr, " Testing hash_destroy ... " ); diff --git a/code_c/Maasha/src/test/test_ucsc.c b/code_c/Maasha/src/test/test_ucsc.c index 7153503..95c4a0b 100644 --- a/code_c/Maasha/src/test/test_ucsc.c +++ b/code_c/Maasha/src/test/test_ucsc.c @@ -3,6 +3,7 @@ #include "list.h" #include "ucsc.h" +static void test_bed_entry_new(); static void test_bed_entry_get(); static void test_bed_entries_get(); static void test_bed_entries_destroy(); @@ -17,6 +18,7 @@ int main() { fprintf( stderr, "Running all tests for ucsc.c\n" ); + test_bed_entry_new(); test_bed_entry_get(); test_bed_entries_get(); test_bed_entries_destroy(); @@ -32,6 +34,22 @@ int main() } +void test_bed_entry_new() +{ + fprintf( stderr, " Testing bed_entry_new ... " ); + + bed_entry *entry = NULL; + + entry = bed_entry_new( 3 ); + + assert( entry->cols == 3 ); + assert( entry->chr_beg == 0 ); + assert( entry->chr_end == 0 ); + + fprintf( stderr, "OK\n" ); +} + + void test_bed_entry_get() { fprintf( stderr, " Testing bed_entry_get ... " ); @@ -42,7 +60,9 @@ void test_bed_entry_get() fp = read_open( path ); - while ( ( entry = bed_entry_get( fp, 12 ) ) != NULL ) + entry = bed_entry_new( 12 ); + + while ( ( bed_entry_get( fp, &entry ) ) ) { // bed_entry_put( entry, 3 ); } @@ -60,11 +80,11 @@ void test_bed_entries_get() char *path = "test/test_files/test12.bed"; list_sl *entries = NULL; - entries = bed_entries_get( path, 0 ); + entries = bed_entries_get( path, 3 ); -// bed_entries_put( entries, 0 ); +// bed_entries_put( entries, 3 ); - fprintf( stderr, "OK\n" ); + fprintf( stderr, "BAD!!!\n" ); } @@ -75,13 +95,13 @@ void test_bed_entries_destroy() char *path = "test/test_files/test12.bed"; list_sl *entries = NULL; - entries = bed_entries_get( path, 0 ); + entries = bed_entries_get( path, 3 ); - bed_entries_destroy( &entries ); +// bed_entries_destroy( &entries ); - assert( entries == NULL ); +// assert( entries == NULL ); - fprintf( stderr, "OK\n" ); + fprintf( stderr, "BAD!!!\n" ); } diff --git a/code_c/Maasha/src/testall.pl b/code_c/Maasha/src/testall.pl index 988690a..eb0ce67 100755 --- a/code_c/Maasha/src/testall.pl +++ b/code_c/Maasha/src/testall.pl @@ -16,6 +16,7 @@ $test_dir = "test"; test_mem test_seq test_strings + test_ucsc ); print STDERR "\nRunning all unit tests:\n\n";