From: martinahansen Date: Thu, 11 Sep 2008 09:09:20 +0000 (+0000) Subject: added bed stuff to c code X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=c0bb10db496a257427e5cc3c2a38a2f0941b10ee;p=biopieces.git added bed stuff to c code git-svn-id: http://biopieces.googlecode.com/svn/trunk@255 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_c/Maasha/src/inc/strings.h b/code_c/Maasha/src/inc/strings.h index 61c72ca..c757e57 100644 --- a/code_c/Maasha/src/inc/strings.h +++ b/code_c/Maasha/src/inc/strings.h @@ -7,6 +7,9 @@ size_t chop( char *string ); /* Returns the length of the chomped string or -1 is no newline was found. */ size_t chomp( char *string ); +/* Returns the total number of a given char in a given string. */ +size_t strchr_total( const char *string, const char c ); + /* Locate a substr in a str starting at pos allowing for a given number of mismatches. */ /* Returns position of match begin or -1 if not found. */ size_t match_substr( size_t pos, char *str, size_t str_len, char *substr, size_t substr_len, size_t mismatch ); diff --git a/code_c/Maasha/src/inc/ucsc.h b/code_c/Maasha/src/inc/ucsc.h index 64e92d0..0ca029d 100644 --- a/code_c/Maasha/src/inc/ucsc.h +++ b/code_c/Maasha/src/inc/ucsc.h @@ -1,21 +1,20 @@ /* Martin Asser Hansen (mail@maasha.dk) Copyright (C) 2008 - All right reserved */ -#define BED_BUFFER 1024 +#define BED_BUFFER 2048 +#define BED_CHR_MAX 64 +#define BED_QID_MAX 256 +#define BED_ITEMRGB_MAX 16 +#define BED_BLOCKSIZES_MAX 512 +#define BED_QBEGS_MAX 512 -struct bed_entry3 -{ - char *chr; - uint chr_beg; - uint chr_end; -}; - -struct bed_entry12 +struct _bed_entry { + int cols; char *chr; uint chr_beg; uint chr_end; char *q_id; - float score; + uint score; char strand; uint thick_beg; uint thick_end; @@ -25,6 +24,11 @@ struct bed_entry12 char *q_begs; }; -void bed_get_entry( FILE *fp, struct bed_entry3 *bed, int cols ); -void bed_split( char *string, struct bed_entry12 *bed, int cols ); +typedef struct _bed_entry bed_entry; +bed_entry *bed_entry_new( const int cols ); +bed_entry *bed_entry_get( FILE *fp, const int cols ); +list_sl *bed_entries_get( char *path, const int cols ); +void bed_entry_put( bed_entry *entry, int cols ); +void bed_entries_put( list_sl *entries, int cols ); +int cmp_bed3_entries_sort( const void *a, const void *b ); diff --git a/code_c/Maasha/src/lib/strings.c b/code_c/Maasha/src/lib/strings.c index 6219d36..2b4daec 100644 --- a/code_c/Maasha/src/lib/strings.c +++ b/code_c/Maasha/src/lib/strings.c @@ -8,8 +8,6 @@ size_t chop( char *string ) { /* Martin A. Hansen, June 2008 */ - /* Unit test done.*/ - /* Remove the last char from a string. */ /* Returns the length of the chopped string.*/ @@ -30,8 +28,6 @@ size_t chomp( char *string ) { /* Martin A. Hansen, June 2008 */ - /* Unit test done.*/ - /* Removes the last char from a string if the char is a newline. */ /* Returns the length of the chomped string or -1 is no newline was found. */ @@ -55,11 +51,26 @@ size_t chomp( char *string ) } -size_t match_substr( size_t pos, char *str, size_t str_len, char *substr, size_t substr_len, size_t mismatch ) +size_t strchr_total( const char *string, const char c ) { - /* Martin A. Hansen, August 2008. */ + /* Martin A. Hansen, September 2008 */ + + /* Returns the total number of a given char in a given string. */ - /* Unit test done.*/ + int count[ 256 ] = { 0 }; /* Integer array spanning the ASCII alphabet */ + int i; + + for ( i = 0; i < strlen( string ); i++ ) { + count[ ( int ) string[ i ] ]++; + } + + return count[ ( int ) c ]; +} + + +size_t match_substr( size_t pos, char *str, size_t str_len, char *substr, size_t substr_len, size_t mismatch ) +{ + /* Martin A. Hansen, August 2008 */ /* Locate a substr in a str starting at pos allowing for a given number of mismatches. */ /* Returns position of match begin or -1 if not found. */ @@ -109,9 +120,7 @@ size_t match_substr( size_t pos, char *str, size_t str_len, char *substr, size_t size_t match_substr_rev( size_t pos, char *str, size_t str_len, char *substr, size_t substr_len, size_t mismatch ) { - /* Martin A. Hansen, August 2008. */ - - /* Unit test done.*/ + /* Martin A. Hansen, August 2008 */ /* Locate a substr in a str backwards starting at the end of */ /* str minus pos allowing for a given number of mismatches. */ @@ -159,3 +168,5 @@ size_t match_substr_rev( size_t pos, char *str, size_t str_len, char *substr, si return -1; } + + diff --git a/code_c/Maasha/src/lib/ucsc.c b/code_c/Maasha/src/lib/ucsc.c index 9231069..6fab258 100644 --- a/code_c/Maasha/src/lib/ucsc.c +++ b/code_c/Maasha/src/lib/ucsc.c @@ -1,31 +1,291 @@ /* Martin Asser Hansen (mail@maasha.dk) Copyright (C) 2008 - All right reserved */ + #include "common.h" #include "mem.h" +#include "filesys.h" +#include "list.h" +#include "strings.h" #include "ucsc.h" -void bed_get_entry( FILE *fp, struct bed_entry3 *bed, int cols ) +bed_entry *bed_entry_new( const int cols ) +{ + bed_entry *entry = mem_get( sizeof( bed_entry ) ); + + entry->cols = cols; + entry->chr = mem_get( BED_CHR_MAX ); + entry->chr_beg = 0; + entry->chr_end = 0; + + if ( cols == 3 ) { + return entry; + } + + entry->q_id = mem_get( BED_QID_MAX ); + + if ( cols == 4 ) { + return entry; + } + + entry->score = 0; + + if ( cols == 5 ) { + return entry; + } + + entry->strand = 0; + + if ( cols == 6 ) { + return entry; + } + + entry->thick_beg = 0; + entry->thick_end = 0; + entry->itemrgb = mem_get( BED_ITEMRGB_MAX ); + entry->blockcount = 0;; + entry->blocksizes = mem_get( BED_BLOCKSIZES_MAX ); + entry->q_begs = mem_get( BED_QBEGS_MAX ); + + return entry; +} + + +bed_entry *bed_entry_get( FILE *fp, int cols ) { - /* Martin A. Hansen, June 2008 */ + bed_entry *entry = bed_entry_new( cols ); + char buffer[ BED_BUFFER ]; + + assert( cols == 0 || cols == 3 || cols == 4 || cols == 5 || cols == 6 || cols == 12 ); + + if ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) + { + if ( ! cols ) + { + cols = 1 + strchr_total( buffer, '\t' ); + entry->cols = cols; + } + + if ( cols == 3 ) + { + sscanf( + buffer, + "%s\t%u\t%u", + entry->chr, + &entry->chr_beg, + &entry->chr_end + ); + + return entry; + } + + if ( cols == 4 ) + { + sscanf( + buffer, + "%s\t%u\t%u\t%s", + entry->chr, + &entry->chr_beg, + &entry->chr_end, + entry->q_id + ); + + return entry; + } + + if ( cols == 5 ) + { + sscanf( + buffer, + "%s\t%u\t%u\t%s\t%u", + entry->chr, + &entry->chr_beg, + &entry->chr_end, + entry->q_id, + &entry->score + ); - /* Get next 3 column bed entry from stream. */ + return entry; + } - char bed_buffer[ BED_BUFFER ]; - struct bed_entry12 *bed12 = NULL; + if ( cols == 6 ) + { + sscanf( + buffer, + "%s\t%u\t%u\t%s\t%u\t%c", + entry->chr, + &entry->chr_beg, + &entry->chr_end, + entry->q_id, + &entry->score, + &entry->strand + ); - bed12 = mem_get( sizeof( bed12 ) ); + return entry; + } + + if ( cols == 12 ) + { + sscanf( + buffer, + "%s\t%u\t%u\t%s\t%u\t%c\t%u\t%u\t%s\t%u\t%s\t%s", + entry->chr, + &entry->chr_beg, + &entry->chr_end, + entry->q_id, + &entry->score, + &entry->strand, + &entry->thick_beg, + &entry->thick_end, + entry->itemrgb, + &entry->blockcount, + entry->blocksizes, + entry->q_begs + ); + + return entry; + } + } + + return NULL; +} + + +list_sl *bed_entries_get( char *path, const int cols ) +{ + list_sl *list = list_sl_new(); + node_sl *node = node_sl_new(); + node_sl *old_node = NULL; + bed_entry *entry = NULL; + FILE *fp = NULL; + + fp = read_open( path ); - if ( ( fgets( bed_buffer, sizeof( bed_buffer ), fp ) != NULL ) ) + if ( ( entry = bed_entry_get( fp, cols ) ) != NULL ) { - printf( "buffer: %s\n", bed_buffer ); + node->val = entry; - bed_split( bed_buffer, bed12, 3 ); + list_sl_add_beg( &list, &node ); - return; + old_node = node; } -// return NULL; + while ( ( entry = bed_entry_get( fp, cols ) ) != NULL ) + { + node = node_sl_new(); + + node->val = entry; + + list_sl_add_after( &old_node, &node ); + + old_node = node; + } + + close_stream( fp ); + + return list; +} + + +void bed_entry_put( bed_entry *entry, int cols ) +{ + if ( ! cols ) { + cols = entry->cols; + } + + if ( cols == 3 ) + { + printf( + "%s\t%u\t%u\n", + entry->chr, + entry->chr_beg, + entry->chr_end + ); + } + else if ( cols == 4 ) + { + printf( + "%s\t%u\t%u\t%s\n", + entry->chr, + entry->chr_beg, + entry->chr_end, + entry->q_id + ); + } + else if ( cols == 5 ) + { + printf( + "%s\t%u\t%u\t%s\t%u\n", + entry->chr, + entry->chr_beg, + entry->chr_end, + entry->q_id, + entry->score + ); + } + else if ( cols == 6 ) + { + printf( + "%s\t%u\t%u\t%s\t%u\t%c\n", + entry->chr, + entry->chr_beg, + entry->chr_end, + entry->q_id, + entry->score, + entry->strand + ); + } + else if ( cols == 12 ) + { + printf( + "%s\t%u\t%u\t%s\t%u\t%c\t%u\t%u\t%s\t%u\t%s\t%s\n", + entry->chr, + entry->chr_beg, + entry->chr_end, + entry->q_id, + entry->score, + entry->strand, + entry->thick_beg, + entry->thick_end, + entry->itemrgb, + entry->blockcount, + entry->blocksizes, + entry->q_begs + ); + } + else + { + fprintf( stderr, "ERROR: Wrong number of columns in bed_entry_put: %d\n", cols ); + + abort(); + } +} + + +void bed_entries_put( list_sl *entries, int cols ) +{ + node_sl *node = NULL; + + for ( node = entries->first; node != NULL; node = node->next ) { + bed_entry_put( ( bed_entry * ) node->val, cols ); + } } +int cmp_bed3_entries_sort( const void *a, const void *b ) +{ + node_sl *a_node = *( ( node_sl ** ) a ); + node_sl *b_node = *( ( node_sl ** ) b ); + + bed_entry *a_entry = ( bed_entry * ) a_node->val; + bed_entry *b_entry = ( bed_entry * ) b_node->val; + + if ( a_entry->chr_end < b_entry->chr_end ) { + return 1; + } else if ( a_entry->chr_end > b_entry->chr_end ) { + return -1; + } else { + return 0; + } +} + diff --git a/code_c/Maasha/src/test/test_files/test12.bed b/code_c/Maasha/src/test/test_files/test12.bed new file mode 100644 index 0000000..7e60b26 --- /dev/null +++ b/code_c/Maasha/src/test/test_files/test12.bed @@ -0,0 +1,10 @@ +chr4 31176 31602 AA695812 0 - 31176 31602 0 1 426, 0, +chr4 44448 44874 AA695812 0 - 44448 44874 0 1 426, 0, +chr4 50522 50841 AA142091 0 - 50522 50841 0 2 81,237, 0,82, +chr4 57489 57808 AA142091 0 - 57489 57808 0 2 81,237, 0,82, +chr4 59352 59778 AA695812 0 - 59352 59778 0 1 426, 0, +chr4 63580 64332 AA979544 0 - 63580 64332 0 1 752, 0, +chr4 63710 64332 AA979534 0 - 63710 64332 0 3 111,481,30, 0,111,592, +chr4 70946 71196 AA699063 0 - 70946 71196 0 2 142,55, 0,195, +chr4 72831 76893 AA264101 0 - 72831 76893 0 2 179,437, 0,3625, +chr4 72872 76630 AA694817 0 - 72872 76630 0 3 83,54,174, 0,84,3584, diff --git a/code_c/Maasha/src/test/test_files/test3.bed b/code_c/Maasha/src/test/test_files/test3.bed new file mode 100644 index 0000000..83f7135 --- /dev/null +++ b/code_c/Maasha/src/test/test_files/test3.bed @@ -0,0 +1,10 @@ +chr4 31176 31602 +chr4 44448 44874 +chr4 50522 50841 +chr4 57489 57808 +chr4 59352 59778 +chr4 63580 64332 +chr4 63710 64332 +chr4 70946 71196 +chr4 72831 76893 +chr4 72872 76630 diff --git a/code_c/Maasha/src/test/test_files/test4.bed b/code_c/Maasha/src/test/test_files/test4.bed new file mode 100644 index 0000000..151a394 --- /dev/null +++ b/code_c/Maasha/src/test/test_files/test4.bed @@ -0,0 +1,10 @@ +chr4 31176 31602 AA695812 +chr4 44448 44874 AA695812 +chr4 50522 50841 AA142091 +chr4 57489 57808 AA142091 +chr4 59352 59778 AA695812 +chr4 63580 64332 AA979544 +chr4 63710 64332 AA979534 +chr4 70946 71196 AA699063 +chr4 72831 76893 AA264101 +chr4 72872 76630 AA694817 diff --git a/code_c/Maasha/src/test/test_files/test5.bed b/code_c/Maasha/src/test/test_files/test5.bed new file mode 100644 index 0000000..d2c6da0 --- /dev/null +++ b/code_c/Maasha/src/test/test_files/test5.bed @@ -0,0 +1,10 @@ +chr4 31176 31602 AA695812 0 +chr4 44448 44874 AA695812 0 +chr4 50522 50841 AA142091 0 +chr4 57489 57808 AA142091 0 +chr4 59352 59778 AA695812 0 +chr4 63580 64332 AA979544 0 +chr4 63710 64332 AA979534 0 +chr4 70946 71196 AA699063 0 +chr4 72831 76893 AA264101 0 +chr4 72872 76630 AA694817 0 diff --git a/code_c/Maasha/src/test/test_files/test6.bed b/code_c/Maasha/src/test/test_files/test6.bed new file mode 100644 index 0000000..eff743a --- /dev/null +++ b/code_c/Maasha/src/test/test_files/test6.bed @@ -0,0 +1,10 @@ +chr4 31176 31602 AA695812 0 - +chr4 44448 44874 AA695812 0 - +chr4 50522 50841 AA142091 0 - +chr4 57489 57808 AA142091 0 - +chr4 59352 59778 AA695812 0 - +chr4 63580 64332 AA979544 0 - +chr4 63710 64332 AA979534 0 - +chr4 70946 71196 AA699063 0 - +chr4 72831 76893 AA264101 0 - +chr4 72872 76630 AA694817 0 - diff --git a/code_c/Maasha/src/test/test_strings.c b/code_c/Maasha/src/test/test_strings.c index 30d04b8..17840ca 100644 --- a/code_c/Maasha/src/test/test_strings.c +++ b/code_c/Maasha/src/test/test_strings.c @@ -3,6 +3,7 @@ static void test_chop(); static void test_chomp(); +static void test_strchr_total(); static void test_match_substr(); static void test_match_substr_rev(); @@ -19,6 +20,7 @@ int main() test_chop(); test_chomp(); + test_strchr_total(); test_match_substr(); test_match_substr_rev(); @@ -60,6 +62,19 @@ static void test_chomp() } +static void test_strchr_total() +{ + fprintf( stderr, " Testing strchr_total ... " ); + + char *str = "X-----X----X"; + + assert( strchr_total( str, 'X' ) == 3 ); + assert( strchr_total( str, '-' ) == 9 ); + + fprintf( stderr, "OK\n" ); +} + + static void test_match_substr() { fprintf( stderr, " Testing match_substr ... " ); diff --git a/code_c/Maasha/src/test/test_ucsc.c b/code_c/Maasha/src/test/test_ucsc.c new file mode 100644 index 0000000..2ef6220 --- /dev/null +++ b/code_c/Maasha/src/test/test_ucsc.c @@ -0,0 +1,76 @@ +#include "common.h" +#include "filesys.h" +#include "list.h" +#include "ucsc.h" + +static void test_bed_entry_get(); +static void test_bed_entries_get(); +static void test_bed_entries_sort(); + + +int main() +{ + fprintf( stderr, "Running all tests for ucsc.c\n" ); + + test_bed_entry_get(); + test_bed_entries_get(); + test_bed_entries_sort(); + + fprintf( stderr, "Done\n\n" ); + + return EXIT_SUCCESS; +} + + +void test_bed_entry_get() +{ + fprintf( stderr, " Testing bed_entry_get ... " ); + + char *path = "test/test_files/test12.bed"; + FILE *fp = NULL; + bed_entry *entry = NULL; + + fp = read_open( path ); + + while ( ( entry = bed_entry_get( fp, 12 ) ) != NULL ) + { +// bed_entry_put( entry, 3 ); + } + + close_stream( fp ); + + fprintf( stderr, "OK\n" ); +} + + +void test_bed_entries_get() +{ + fprintf( stderr, " Testing bed_entries_get ... " ); + + char *path = "test/test_files/test12.bed"; + list_sl *entries = NULL; + + entries = bed_entries_get( path, 0 ); + + bed_entries_put( entries, 0 ); + + fprintf( stderr, "OK\n" ); +} + + +void test_bed_entries_sort() +{ + fprintf( stderr, " Testing bed_entries_sort ... " ); + + char *path = "test/test_files/test12.bed"; + list_sl *entries = NULL; + + entries = bed_entries_get( path, 0 ); + + list_sl_sort( &entries, cmp_bed3_entries_sort ); + + bed_entries_put( entries, 0 ); + + fprintf( stderr, "OK\n" ); +} +