From a2d28332eb202ee22207b9875986dca8cfa2314b Mon Sep 17 00:00:00 2001 From: martinahansen Date: Fri, 5 Sep 2008 07:17:33 +0000 Subject: [PATCH] bipartite_scan now working git-svn-id: http://biopieces.googlecode.com/svn/trunk@247 74ccb610-7750-0410-82ae-013aeee3265d --- code_c/Maasha/src/bipartite_scan.c | 279 ++++++++++++++++++----------- code_c/Maasha/src/inc/common.h | 1 + code_c/Maasha/src/test/test_mem.c | 2 - 3 files changed, 176 insertions(+), 106 deletions(-) diff --git a/code_c/Maasha/src/bipartite_scan.c b/code_c/Maasha/src/bipartite_scan.c index 2b06bec..c317445 100644 --- a/code_c/Maasha/src/bipartite_scan.c +++ b/code_c/Maasha/src/bipartite_scan.c @@ -5,26 +5,27 @@ #include "fasta.h" #include "list.h" -#define BLOCK_SIZE_NT 4 /* one block holds 4 nucleotides. */ -#define BITS_IN_NT 2 /* two bits holds 1 nucleotide. */ -#define BITS_IN_BYTE 8 /* number of bits in one byte. */ -#define BLOCK_SPACE_MAX 256 /* maximum space between two blocks. */ -// #define COUNT_ARRAY_SIZE ( 1 << 30 ) /* size of the unsigned int count array. */ -#define COUNT_ARRAY_SIZE ( 1 << 5 ) /* size of the unsigned int count array. */ - -#define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */ -#define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */ -#define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */ -#define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */ - +#define BLOCK_SIZE_NT 4 /* one block holds 4 nucleotides. */ +#define BITS_IN_NT 2 /* two bits holds 1 nucleotide. */ +#define BITS_IN_BYTE 8 /* number of bits in one byte. */ +#define BLOCK_SPACE_MAX 64 /* maximum space between two blocks. */ +#define COUNT_ARRAY_SIZE ( sizeof( uint ) * ( 1 << 30 ) ) /* size of the unsigned int count array. */ + +#define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */ +#define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */ +#define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */ +#define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */ + +/* Structure that will hold one tetra nucleotide block. */ struct _bitblock { - uchar bin; - bool hasN; + uchar bin; /* Tetra nucleotide encoded binary. */ + bool hasN; /* Flag indicating any N's in the block. */ }; typedef struct _bitblock bitblock; +/* Byte array for fast convertion of binary blocks back to DNA. */ char *bin2dna[256] = { "AAAA", "AAAC", "AAAG", "AAAT", "AACA", "AACC", "AACG", "AACT", "AAGA", "AAGC", "AAGG", "AAGT", "AATA", "AATC", "AATG", "AATT", @@ -60,19 +61,23 @@ char *bin2dna[256] = { "TTGA", "TTGC", "TTGG", "TTGT", "TTTA", "TTTC", "TTTG", "TTTT" }; +/* Function declarations. */ void run_scan( int argc, char *argv[] ); void print_usage(); void scan_file( char *file, seq_entry *entry, uint *count_array ); +uint *count_array_new( size_t size ); void scan_seq( char *seq, size_t seq_len, uint *count_array ); void scan_list( list_sl *list, uint *count_array ); bitblock *bitblock_new(); -void bitblock_print( bitblock *out ); -void bitblock_list_print( list_sl *list ); -uint blocks2motif( uchar bin1, uchar bin2, short dist ); +uint blocks2motif( uchar bin1, uchar bin2, ushort dist ); void count_array_print( uint *count_array ); void motif_print( uint motif, uint count ); +void bitblock_list_print( list_sl *list ); +void bitblock_print( bitblock *out ); +/* Unit test declarations. */ static void run_tests(); +static void test_count_array_new(); static void test_bitblock_new(); static void test_bitblock_print(); static void test_bitblock_list_print(); @@ -80,6 +85,9 @@ static void test_scan_seq(); static void test_blocks2motif(); +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MAIN <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */ + + int main( int argc, char *argv[] ) { if ( argc == 1 ) { @@ -93,8 +101,15 @@ int main( int argc, char *argv[] ) } +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FUNCTIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */ + + void print_usage() { + /* Martin A. Hansen, September 2008 */ + + /* Print usage and exit if no files in argument. */ + fprintf( stderr, "Usage: bipartite_scam > result.csv\n" ); @@ -107,14 +122,16 @@ void run_scan( int argc, char *argv[] ) { /* Martin A. Hansen, September 2008 */ - /* Scan a stack of files. */ + /* For each file in argv scan the file for */ + /* bipartite motifs and output the motifs */ + /* and their count. */ char *file = NULL; int i = 0; seq_entry *entry = NULL; uint *count_array = NULL; - count_array = mem_get_zero( COUNT_ARRAY_SIZE ); + count_array = count_array_new( COUNT_ARRAY_SIZE ); entry = seq_new( MAX_SEQ_NAME, MAX_SEQ ); @@ -141,26 +158,32 @@ void run_scan( int argc, char *argv[] ) } +uint *count_array_new( size_t size ) +{ + /* Martin A. Hansen, September 2008 */ + + /* Initialize a new zeroed uint array of a given byte size. */ + + uint *array = NULL; + + array = mem_get_zero( size ); + + return array; +} + + void scan_file( char *file, seq_entry *entry, uint *count_array ) { /* Martin A. Hansen, September 2008 */ - /* Scan all entries of a file in both */ - /* sense and anti-sense directions .*/ + /* Scan all FASTA entries of a file in both */ + /* sense and anti-sense directions. */ FILE *fp = read_open( file ); while ( fasta_get_entry( fp, &entry ) == TRUE ) { - fprintf( stderr, " + Scanning: %s ... ", entry->seq_name ); - - scan_seq( entry->seq, entry->seq_len, count_array ); - - fprintf( stderr, "done.\n" ); - - revcomp_dna( entry->seq ); - - fprintf( stderr, " - Scanning: %s ... ", entry->seq_name ); + fprintf( stderr, " Scanning: %s (%zu nt) ... ", entry->seq_name, entry->seq_len ); scan_seq( entry->seq, entry->seq_len, count_array ); @@ -171,54 +194,21 @@ void scan_file( char *file, seq_entry *entry, uint *count_array ) } -bitblock *bitblock_new() -{ - /* Martin A. Hansen, September 2008 */ - - /* Initializes a new block. */ - - bitblock *new_block = NULL; - - new_block = mem_get( sizeof( bitblock ) ); - - new_block->bin = 0; - new_block->hasN = FALSE; - - return new_block; -} - - -void bitblock_print( bitblock *out ) -{ - /* Martin A. Hansen, September 2008 */ - - /* Debug function to print a given block. */ - - printf( "bin: %d dna: %s hasN: %d\n", out->bin, bin2dna[ ( int ) out->bin ], out->hasN ); -} - - -void bitblock_list_print( list_sl *list ) -{ - /* Martin A. Hansen, September 2008 */ - - /* Debug function to print all blocks in a list. */ - - node_sl *node = NULL; - - for ( node = list->first; node != NULL; node = node->next ) { - bitblock_print( ( bitblock * ) node->val ); - } -} - - void scan_seq( char *seq, size_t seq_len, uint *count_array ) { /* Martin A. Hansen, September 2008 */ + + /* Run a sliding window over a given sequence. The window */ + /* consists of a list where new blocks of 4 nucleotides */ + /* are pushed onto one end while at the same time old */ + /* blocks are popped from the other end. The number of */ + /* in the list is determined by the maximum seperator. */ + /* Everytime we have a full window, the window is scanned */ + /* for motifs. */ bitblock *block = NULL; - short b_count = 0; - short n_count = 0; + size_t b_count = 0; + ushort n_count = 0; size_t i = 0; uchar bin = 0; bool first_node = TRUE; @@ -268,7 +258,7 @@ void scan_seq( char *seq, size_t seq_len, uint *count_array ) if ( b_count > BLOCK_SPACE_MAX + BLOCK_SIZE_NT ) { // bitblock_list_print( list ); - + scan_list( list, count_array ); list_sl_remove_beg( &list ); @@ -284,12 +274,17 @@ void scan_list( list_sl *list, uint *count_array ) { /* Martin A. Hansen, September 2008 */ + /* Scan a list of blocks for biparite motifs by creating */ + /* a binary motif consisting of two blocks of 4 nucleotides */ + /* along with the distance separating them. Motifs containing */ + /* N's are skipped. */ + node_sl *first_node = NULL; node_sl *next_node = NULL; bitblock *block1 = NULL; bitblock *block2 = NULL; int i = 0; - short dist = 0; + ushort dist = 0; uint motif_bin = 0; // bitblock_list_print( list ); @@ -310,13 +305,13 @@ void scan_list( list_sl *list, uint *count_array ) { block2 = ( bitblock * ) next_node->val; -// printf( "block1: %s block2: %s dist: %d\n", bin2dna[ block1->bin ], bin2dna[ block2->bin ], dist ); + // printf( "block1: %s block2: %s dist: %d\n", bin2dna[ block1->bin ], bin2dna[ block2->bin ], dist ); /* DEBUG */ if ( ! block2->hasN ) { motif_bin = blocks2motif( block1->bin, block2->bin, dist ); - // motif_print( motif_bin ); +// motif_print( motif_bin, 0 ); /* DEBUG */ count_array[ motif_bin ]++; } @@ -327,10 +322,31 @@ void scan_list( list_sl *list, uint *count_array ) } -uint blocks2motif( uchar bin1, uchar bin2, short dist ) +bitblock *bitblock_new() { /* Martin A. Hansen, September 2008 */ + /* Initializes a new empty bitblock. */ + + bitblock *new_block = NULL; + + new_block = mem_get( sizeof( bitblock ) ); + + new_block->bin = 0; + new_block->hasN = FALSE; + + return new_block; +} + + +uint blocks2motif( uchar bin1, uchar bin2, ushort dist ) +{ + /* Martin A. Hansen, September 2008 */ + + /* Given two binary encoded tetra nuceotide blocks, */ + /* and the distance separating this, create a binary */ + /* bipartite motif. */ + uint motif = 0; motif |= bin1; @@ -339,7 +355,7 @@ uint blocks2motif( uchar bin1, uchar bin2, short dist ) motif |= bin2; - motif <<= sizeof( short ) * BITS_IN_BYTE; + motif <<= sizeof( uchar ) * BITS_IN_BYTE; motif |= dist; @@ -351,19 +367,21 @@ void count_array_print( uint *count_array ) { /* Martin A. Hansen, Seqptember 2008. */ - /* Print all motifs in count_array as */ + /* Print all bipartite motifs in count_array as */ /* tabular output. */ uint i = 0; uint motif = 0; uint count = 0; - for ( i = 0; i < COUNT_ARRAY_SIZE; i++ ) + for ( i = 0; i < COUNT_ARRAY_SIZE / 4; i++ ) { motif = i; count = count_array[ i ]; - motif_print( motif, count ); + if ( count > 0 ) { + motif_print( motif, count ); + } } } @@ -372,13 +390,18 @@ void motif_print( uint motif, uint count ) { /* Martin A. Hansen, September 2008 */ - uchar bin1 = 0; - uchar bin2 = 0; - short dist = 0; + /* Converts a binary encoded bipartite motif */ + /* into DNA and output the motif, distance and */ + /* count seperated by tabs: */ + /* BLOCK1 \t BLOCK2 \t DIST \t COUNT */ + + uchar bin1 = 0; + uchar bin2 = 0; + ushort dist = 0; - dist = ( short ) motif; + dist = ( ushort ) motif & 255; - motif >>= sizeof( short ) * BITS_IN_BYTE; + motif >>= sizeof( uchar ) * BITS_IN_BYTE; bin2 = ( uchar ) motif; @@ -390,39 +413,86 @@ void motif_print( uint motif, uint count ) } +void bitblock_list_print( list_sl *list ) +{ + /* Martin A. Hansen, September 2008 */ + + /* Debug function to print all blocks in a list. */ + + node_sl *node = NULL; + + printf( "\nbitblock_list_print:\n" ); + + for ( node = list->first; node != NULL; node = node->next ) { + bitblock_print( ( bitblock * ) node->val ); + } +} + + +void bitblock_print( bitblock *out ) +{ + /* Martin A. Hansen, September 2008 */ + + /* Debug function to print a given block. */ + + printf( "bin: %d dna: %s hasN: %d\n", out->bin, bin2dna[ ( int ) out->bin ], out->hasN ); +} + + /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> UNIT TESTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */ void run_tests() { - printf( "Running tests\n" ); + fprintf( stderr, "Running tests\n" ); + test_count_array_new(); test_bitblock_new(); test_bitblock_print(); test_bitblock_list_print(); test_scan_seq(); test_blocks2motif(); - printf( "All tests OK\n" ); + fprintf( stderr, "All tests OK\n" ); +} + + +void test_count_array_new() +{ + fprintf( stderr, " Running test_count_array_new ... " ); + + size_t i = 0; + size_t size = 128; + uint *array = NULL; + + array = count_array_new( 4 * size ); + + for ( i = 0; i < size; i++ ) { + assert( array[ i ] == 0 ); + } + + mem_free( &array ); + + fprintf( stderr, "done.\n" ); } void test_bitblock_new() { - printf( " Running test_bitblock_new ... " ); + fprintf( stderr, " Running test_bitblock_new ... " ); bitblock *new_block = bitblock_new(); assert( new_block->bin == 0 ); assert( new_block->hasN == FALSE ); - printf( "done.\n"); + fprintf( stderr, "done.\n" ); } void test_bitblock_print() { - printf( " Running test_bitblock_print ... " ); + fprintf( stderr, " Running test_bitblock_print ... " ); bitblock *new_block = bitblock_new(); @@ -431,13 +501,13 @@ void test_bitblock_print() // bitblock_print( new_block ); - printf( "done.\n"); + fprintf( stderr, "done.\n"); } void test_bitblock_list_print() { - printf( " Running test_bitblock_list_print ... " ); + fprintf( stderr, " Running test_bitblock_list_print ... " ); list_sl *list = list_sl_new(); node_sl *node1 = node_sl_new(); @@ -464,39 +534,40 @@ void test_bitblock_list_print() // bitblock_list_print( list ); - printf( "done.\n" ); + fprintf( stderr, "done.\n" ); } void test_scan_seq() { - printf( " Running test_scan_seq ... " ); + fprintf( stderr, " Running test_scan_seq ... " ); //char *seq = "AAAANTCGGCTNGGGG"; - char *seq = "AAAATCGGCTGGGG"; + //char *seq = "AAAATCGGCTGGGG"; + char *seq = "AAAAAAAAAAAAAAA"; size_t seq_len = strlen( seq ); uint *count_array = mem_get_zero( sizeof( uint ) * ( 1 << 5 ) ); scan_seq( seq, seq_len, count_array ); - printf( "done.\n"); + fprintf( stderr, "done.\n" ); } static void test_blocks2motif() { - printf( " Running test_blocks2motif ... " ); + fprintf( stderr, " Running test_blocks2motif ... " ); - uchar bin1 = 4; - uchar bin2 = 3; - short dist = 256; - uint motif = 0; + uchar bin1 = 4; + uchar bin2 = 3; + ushort dist = 256; + uint motif = 0; motif = blocks2motif( bin1, bin2, dist ); // printf( "motif: %d\n", motif ); - printf( "done.\n"); + fprintf( stderr, "done.\n"); } diff --git a/code_c/Maasha/src/inc/common.h b/code_c/Maasha/src/inc/common.h index 3cfc15b..453e80d 100644 --- a/code_c/Maasha/src/inc/common.h +++ b/code_c/Maasha/src/inc/common.h @@ -10,6 +10,7 @@ #include typedef unsigned char uchar; +typedef unsigned short ushort; typedef char bool; #define TRUE 1 diff --git a/code_c/Maasha/src/test/test_mem.c b/code_c/Maasha/src/test/test_mem.c index 6f4f541..b5dbd65 100644 --- a/code_c/Maasha/src/test/test_mem.c +++ b/code_c/Maasha/src/test/test_mem.c @@ -96,8 +96,6 @@ void test_mem_resize_zero() memset( pt, '1', size_before ); - assert( strlen( pt ) == size_before ); - pt = mem_resize_zero( pt, size_before, size_after ); assert( strlen( pt ) == size_before ); -- 2.39.5