1 /* Martin Asser Hansen (mail@maasha.dk) Copyright (C) 2008 - All right reserved */
10 #define BLOCK_SIZE_NT 4 /* one block holds 4 nucleotides. */
11 #define BITS_IN_NT 2 /* two bits holds 1 nucleotide. */
12 #define BITS_IN_BYTE 8 /* number of bits in one byte. */
13 #define BLOCK_SPACE_MAX 64 /* maximum space between two blocks. */
14 #define BLOCK_MASK ( ( BLOCK_SPACE_MAX << 1 ) - 1 ) /* mask for printing block space. */
15 #define COUNT_ARRAY_NMEMB ( 1 << 30 ) /* number of objects in the unsigned int count array. */
16 #define CUTOFF 10000 /* minimum number of motifs in output. */
18 #define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */
19 #define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */
20 #define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */
21 #define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */
23 /* Structure that will hold one tetra nucleotide block. */
26 uchar bin; /* Tetra nucleotide binary encoded. */
27 bool hasN; /* Flag indicating any N's in the block. */
30 typedef struct _bitblock bitblock;
32 /* Function declarations. */
33 void run_scan( int argc, char *argv[] );
35 void scan_file( char *file, seq_entry *entry, uint *count_array );
36 uint *count_array_new( size_t nmemb );
37 void scan_seq( char *seq, size_t seq_len, uint *count_array );
38 void scan_list( list_sl *list, uint *count_array );
39 bitblock *bitblock_new();
40 uint blocks2motif( uchar bin1, uchar bin2, ushort dist );
41 void count_array_print( uint *count_array, size_t nmemb, size_t cutoff );
42 void bitblock_list_print( list_sl *list );
43 void bitblock_print( bitblock *out );
45 /* Unit test declarations. */
46 static void run_tests();
47 static void test_count_array_new();
48 static void test_bitblock_new();
49 static void test_bitblock_print();
50 static void test_bitblock_list_print();
51 static void test_scan_seq();
52 static void test_blocks2motif();
55 /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MAIN <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
58 int main( int argc, char *argv[] )
66 run_scan( argc, argv );
72 /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FUNCTIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
77 /* Martin A. Hansen, September 2008 */
79 /* Print usage and exit if no files in argument. */
82 "Usage: bipartite_scam <FASTA file(s)> > result.csv\n"
89 void run_scan( int argc, char *argv[] )
91 /* Martin A. Hansen, September 2008 */
93 /* For each file in argv scan the file for */
94 /* bipartite motifs and output the motifs */
95 /* and their count. */
99 seq_entry *entry = NULL;
100 uint *count_array = NULL;
101 // size_t new_nmemb = 0;
103 count_array = count_array_new( COUNT_ARRAY_NMEMB );
105 entry = seq_new( MAX_SEQ_NAME, MAX_SEQ );
107 for ( i = 1; i < argc; i++ )
111 fprintf( stderr, "Scanning file: %s\n", file );
112 scan_file( file, entry, count_array );
113 fprintf( stderr, "done.\n" );
116 fprintf( stderr, "Printing motifs: ... " );
117 count_array_print( count_array, COUNT_ARRAY_NMEMB, CUTOFF );
118 fprintf( stderr, "done.\n" );
120 seq_destroy( entry );
122 mem_free( &count_array );
126 uint *count_array_new( size_t nmemb )
128 /* Martin A. Hansen, September 2008 */
130 /* Initialize a new zeroed uint array of nmemb objects. */
136 array = mem_get_zero( nmemb * sizeof( uint ) );
142 void scan_file( char *file, seq_entry *entry, uint *count_array )
144 /* Martin A. Hansen, September 2008 */
146 /* Scan all FASTA entries of a file in both */
147 /* sense and anti-sense directions. */
149 FILE *fp = read_open( file );
151 while ( fasta_get_entry( fp, &entry ) == TRUE )
153 fprintf( stderr, " Scanning: %s (%zu nt) ... ", entry->seq_name, entry->seq_len );
155 scan_seq( entry->seq, entry->seq_len, count_array );
157 fprintf( stderr, "done.\n" );
164 void scan_seq( char *seq, size_t seq_len, uint *count_array )
166 /* Martin A. Hansen, September 2008 */
168 /* Run a sliding window over a given sequence. The window */
169 /* consists of a list where new blocks of 4 nucleotides */
170 /* are pushed onto one end while at the same time old */
171 /* blocks are popped from the other end. The number of */
172 /* in the list is determined by the maximum seperator. */
173 /* Everytime we have a full window, the window is scanned */
176 bitblock *block = NULL;
181 bool first_node = TRUE;
182 node_sl *new_node = NULL;
183 node_sl *old_node = NULL;
184 list_sl *list = list_sl_new();
186 for ( i = 0; seq[ i ]; i++ )
192 case 'A': case 'a': add_A( bin ); break;
193 case 'T': case 't': add_T( bin ); break;
194 case 'C': case 'c': add_C( bin ); break;
195 case 'G': case 'g': add_G( bin ); break;
196 default: n_count = BLOCK_SIZE_NT; break;
199 if ( i > BLOCK_SIZE_NT - 2 )
203 block = bitblock_new();
212 new_node = node_sl_new();
213 new_node->val = block;
216 list_sl_add_beg( &list, &new_node );
218 list_sl_add_after( &old_node, &new_node );
225 if ( b_count > BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
227 // bitblock_list_print( list ); /* DEBUG */
229 scan_list( list, count_array );
231 list_sl_remove_beg( &list );
236 /* if the list is shorter than BLOCK_SPACE_MAX + BLOCK_SIZE_NT */
237 if ( b_count <= BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
239 // bitblock_list_print( list ); /* DEBUG */
241 scan_list( list, count_array );
244 list_sl_destroy( &list );
248 void scan_list( list_sl *list, uint *count_array )
250 /* Martin A. Hansen, September 2008 */
252 /* Scan a list of blocks for biparite motifs by creating */
253 /* a binary motif consisting of two blocks of 4 nucleotides */
254 /* along with the distance separating them. Motifs containing */
255 /* N's are skipped. */
257 node_sl *first_node = NULL;
258 node_sl *next_node = NULL;
259 bitblock *block1 = NULL;
260 bitblock *block2 = NULL;
265 // bitblock_list_print( list );
267 first_node = list->first;
269 block1 = ( bitblock * ) first_node->val;
271 if ( ! block1->hasN )
273 next_node = first_node->next;
275 for ( i = 0; i < BLOCK_SIZE_NT - 1; i++ ) {
276 next_node = next_node->next;
279 for ( next_node = next_node; next_node != NULL; next_node = next_node->next )
281 block2 = ( bitblock * ) next_node->val;
283 if ( ! block2->hasN )
285 motif_bin = blocks2motif( block1->bin, block2->bin, dist );
287 // bitblock_list_print( list ); /* DEBUG */
289 count_array[ motif_bin ]++;
298 bitblock *bitblock_new()
300 /* Martin A. Hansen, September 2008 */
302 /* Initializes a new empty bitblock. */
304 bitblock *new_block = NULL;
306 new_block = mem_get( sizeof( bitblock ) );
309 new_block->hasN = FALSE;
315 uint blocks2motif( uchar bin1, uchar bin2, ushort dist )
317 /* Martin A. Hansen, September 2008 */
319 /* Given two binary encoded tetra nuceotide blocks, */
320 /* and the distance separating this, create a binary */
321 /* bipartite motif. */
327 motif <<= sizeof( uchar ) * BITS_IN_BYTE;
331 motif <<= sizeof( uchar ) * BITS_IN_BYTE;
339 void count_array_print( uint *count_array, size_t nmemb, size_t cutoff )
341 /* Martin A. Hansen, Seqptember 2008. */
343 /* Print all bipartite motifs in count_array as */
344 /* tabular output. */
350 for ( i = 0; i < nmemb; i++ )
353 count = count_array[ i ];
355 if ( count >= cutoff ) {
356 printf( "%u\t%u\n", motif, count );
362 /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> UNIT TESTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
367 fprintf( stderr, "Running tests\n" );
369 test_count_array_new();
371 test_bitblock_print();
372 test_bitblock_list_print();
376 fprintf( stderr, "All tests OK\n" );
380 void test_count_array_new()
382 fprintf( stderr, " Running test_count_array_new ... " );
388 array = count_array_new( nmemb );
390 for ( i = 0; i < nmemb; i++ ) {
391 assert( array[ i ] == 0 );
396 fprintf( stderr, "done.\n" );
400 void test_bitblock_new()
402 fprintf( stderr, " Running test_bitblock_new ... " );
404 bitblock *new_block = bitblock_new();
406 assert( new_block->bin == 0 );
407 assert( new_block->hasN == FALSE );
409 fprintf( stderr, "done.\n" );
413 void test_bitblock_print()
415 fprintf( stderr, " Running test_bitblock_print ... " );
417 bitblock *new_block = bitblock_new();
420 new_block->hasN = TRUE;
422 // bitblock_print( new_block );
424 fprintf( stderr, "done.\n");
428 void test_bitblock_list_print()
430 fprintf( stderr, " Running test_bitblock_list_print ... " );
432 list_sl *list = list_sl_new();
433 node_sl *node1 = node_sl_new();
434 node_sl *node2 = node_sl_new();
435 node_sl *node3 = node_sl_new();
436 bitblock *block1 = bitblock_new();
437 bitblock *block2 = bitblock_new();
438 bitblock *block3 = bitblock_new();
451 list_sl_add_beg( &list, &node1 );
452 list_sl_add_beg( &list, &node2 );
453 list_sl_add_beg( &list, &node3 );
455 // bitblock_list_print( list );
457 fprintf( stderr, "done.\n" );
463 fprintf( stderr, " Running test_scan_seq ... " );
465 //char *seq = "AAAANTCGGCTNGGGG";
466 //char *seq = "AAAATCGGCTGGGG";
467 char *seq = "AAAAAAAAAAAAAAG";
468 size_t seq_len = strlen( seq );
469 size_t nmemb = 1 << 5;
471 uint *count_array = count_array_new( nmemb );
473 scan_seq( seq, seq_len, count_array );
475 // count_array_print( count_array, nmemb, 1 ); /* DEBUG */
477 fprintf( stderr, "done.\n" );
481 static void test_blocks2motif()
483 fprintf( stderr, " Running test_blocks2motif ... " );
490 motif = blocks2motif( bin1, bin2, dist );
492 // printf( "motif: %d\n", motif );
494 fprintf( stderr, "done.\n");
498 /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */