1 /* Martin Asser Hansen (mail@maasha.dk) Copyright (C) 2008 - All right reserved */
10 #define BLOCK_SIZE_NT 4 /* one block holds 4 nucleotides. */
11 #define BITS_IN_NT 2 /* two bits holds 1 nucleotide. */
12 #define BITS_IN_BYTE 8 /* number of bits in one byte. */
13 #define BLOCK_SPACE_MAX 64 /* maximum space between two blocks. */
14 #define BLOCK_MASK ( ( BLOCK_SPACE_MAX << 1 ) - 1 ) /* mask for printing block space. */
15 #define COUNT_ARRAY_NMEMB ( 1 << 30 ) /* number of objects in the unsigned int count array. */
16 #define CUTOFF 1 /* minimum number of motifs in output. */
18 /* Structure that will hold one tetra nucleotide block. */
21 uchar bin; /* Tetra nucleotide binary encoded. */
22 bool hasN; /* Flag indicating any N's in the block. */
25 typedef struct _bitblock bitblock;
27 /* Function declarations. */
28 void run_scan( int argc, char *argv[] );
30 void scan_file( char *file, seq_entry *entry, uint *count_array );
31 void rescan_file( char *file, seq_entry *entry, uint *count_array, size_t cutoff );
32 uint *count_array_new( size_t nmemb );
33 void scan_seq( char *seq, size_t seq_len, uint *count_array );
34 void rescan_seq( char *seq, size_t seq_len, uint *count_array, size_t cutoff );
35 void scan_list( list_sl *list, uint *count_array );
36 void rescan_list( list_sl *list, uint *count_array, size_t pos, size_t cutoff );
37 bitblock *bitblock_new();
38 uint blocks2motif( uchar bin1, uchar bin2, ushort dist );
39 void count_array_print( uint *count_array, size_t nmemb, size_t cutoff );
40 void bitblock_list_print( list_sl *list );
41 void bitblock_print( bitblock *out );
43 /* Unit test declarations. */
44 static void run_tests();
45 static void test_count_array_new();
46 static void test_bitblock_new();
47 static void test_bitblock_print();
48 static void test_bitblock_list_print();
49 static void test_scan_seq();
50 static void test_blocks2motif();
53 /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MAIN <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
56 int main( int argc, char *argv[] )
64 run_scan( argc, argv );
70 /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FUNCTIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
75 /* Martin A. Hansen, September 2008 */
77 /* Print usage and exit if no files in argument. */
79 fprintf( stderr, "Usage: bipartite_scam <FASTA file(s)> > result.csv\n" );
85 void run_scan( int argc, char *argv[] )
87 /* Martin A. Hansen, September 2008 */
89 /* For each file in argv scan the file for */
90 /* bipartite motifs and output the motifs */
91 /* and their count. */
95 seq_entry *entry = NULL;
96 uint *count_array = NULL;
97 // size_t new_nmemb = 0;
99 count_array = count_array_new( COUNT_ARRAY_NMEMB );
101 entry = seq_new( MAX_SEQ_NAME, MAX_SEQ );
103 for ( i = 1; i < argc; i++ )
107 fprintf( stderr, "Scanning file: %s\n", file );
108 scan_file( file, entry, count_array );
109 fprintf( stderr, "done.\n" );
112 fprintf( stderr, "Printing motifs: ... " );
113 count_array_print( count_array, COUNT_ARRAY_NMEMB, CUTOFF );
114 fprintf( stderr, "done.\n" );
118 fprintf( stderr, "Rescanning file: %s\n", file );
119 rescan_file( file, entry, count_array, CUTOFF );
120 fprintf( stderr, "done.\n" );
122 seq_destroy( entry );
124 mem_free( &count_array );
128 uint *count_array_new( size_t nmemb )
130 /* Martin A. Hansen, September 2008 */
132 /* Initialize a new zeroed uint array of nmemb objects. */
138 array = mem_get_zero( nmemb * sizeof( uint ) );
144 void scan_file( char *file, seq_entry *entry, uint *count_array )
146 /* Martin A. Hansen, September 2008 */
148 /* Scan all FASTA entries of a file. */
150 FILE *fp = read_open( file );
152 while ( fasta_get_entry( fp, &entry ) == TRUE )
154 fprintf( stderr, " Scanning: %s (%zu nt) ... ", entry->seq_name, entry->seq_len );
156 scan_seq( entry->seq, entry->seq_len, count_array );
158 fprintf( stderr, "done.\n" );
165 void rescan_file( char *file, seq_entry *entry, uint *count_array, size_t cutoff )
167 /* Martin A. Hansen, September 2008 */
169 /* Rescan all FASTA entries of a file. */
171 FILE *fp = read_open( file );
173 while ( fasta_get_entry( fp, &entry ) == TRUE )
175 fprintf( stderr, " Rescanning: %s (%zu nt) ... ", entry->seq_name, entry->seq_len );
177 rescan_seq( entry->seq, entry->seq_len, count_array, cutoff );
179 fprintf( stderr, "done.\n" );
186 void scan_seq( char *seq, size_t seq_len, uint *count_array )
188 /* Martin A. Hansen, September 2008 */
190 /* Run a sliding window over a given sequence. The window */
191 /* consists of a list where new blocks of 4 nucleotides */
192 /* are pushed onto one end while at the same time old */
193 /* blocks are popped from the other end. The number of */
194 /* in the list is determined by the maximum seperator. */
195 /* Everytime we have a full window, the window is scanned */
198 bitblock *block = NULL;
203 bool first_node = TRUE;
204 node_sl *new_node = NULL;
205 node_sl *old_node = NULL;
206 list_sl *list = list_sl_new();
208 for ( i = 0; seq[ i ]; i++ )
214 case 'A': case 'a': add_A( bin ); break;
215 case 'T': case 't': add_T( bin ); break;
216 case 'C': case 'c': add_C( bin ); break;
217 case 'G': case 'g': add_G( bin ); break;
218 default: n_count = BLOCK_SIZE_NT; break;
221 if ( i > BLOCK_SIZE_NT - 2 )
225 block = bitblock_new();
234 new_node = node_sl_new();
235 new_node->val = block;
238 list_sl_add_beg( &list, &new_node );
240 list_sl_add_after( &old_node, &new_node );
247 if ( b_count > BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
249 // bitblock_list_print( list ); /* DEBUG */
251 scan_list( list, count_array );
253 list_sl_remove_beg( &list );
258 /* if the list is shorter than BLOCK_SPACE_MAX + BLOCK_SIZE_NT */
259 if ( b_count <= BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
261 // bitblock_list_print( list ); /* DEBUG */
263 scan_list( list, count_array );
266 list_sl_destroy( &list );
270 void rescan_seq( char *seq, size_t seq_len, uint *count_array, size_t cutoff )
272 /* Martin A. Hansen, September 2008 */
274 /* Run a sliding window over a given sequence. The window */
275 /* consists of a list where new blocks of 4 nucleotides */
276 /* are pushed onto one end while at the same time old */
277 /* blocks are popped from the other end. The number of */
278 /* in the list is determined by the maximum seperator. */
279 /* Everytime we have a full window, the window is scanned */
282 bitblock *block = NULL;
287 bool first_node = TRUE;
288 node_sl *new_node = NULL;
289 node_sl *old_node = NULL;
290 list_sl *list = list_sl_new();
292 for ( i = 0; seq[ i ]; i++ )
298 case 'A': case 'a': add_A( bin ); break;
299 case 'T': case 't': add_T( bin ); break;
300 case 'C': case 'c': add_C( bin ); break;
301 case 'G': case 'g': add_G( bin ); break;
302 default: n_count = BLOCK_SIZE_NT; break;
305 if ( i > BLOCK_SIZE_NT - 2 )
309 block = bitblock_new();
318 new_node = node_sl_new();
319 new_node->val = block;
322 list_sl_add_beg( &list, &new_node );
324 list_sl_add_after( &old_node, &new_node );
331 if ( b_count > BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
333 // bitblock_list_print( list ); /* DEBUG */
335 rescan_list( list, count_array, i, cutoff );
337 list_sl_remove_beg( &list );
342 /* if the list is shorter than BLOCK_SPACE_MAX + BLOCK_SIZE_NT */
343 if ( b_count <= BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
345 // bitblock_list_print( list ); /* DEBUG */
347 rescan_list( list, count_array, i, cutoff );
350 list_sl_destroy( &list );
354 void scan_list( list_sl *list, uint *count_array )
356 /* Martin A. Hansen, September 2008 */
358 /* Scan a list of blocks for biparite motifs by creating */
359 /* a binary motif consisting of two blocks of 4 nucleotides */
360 /* along with the distance separating them. Motifs containing */
361 /* N's are skipped. */
363 node_sl *first_node = NULL;
364 node_sl *next_node = NULL;
365 bitblock *block1 = NULL;
366 bitblock *block2 = NULL;
371 // bitblock_list_print( list );
373 first_node = list->first;
375 block1 = ( bitblock * ) first_node->val;
377 if ( ! block1->hasN )
379 next_node = first_node->next;
381 for ( i = 0; i < BLOCK_SIZE_NT - 1; i++ ) {
382 next_node = next_node->next;
385 for ( next_node = next_node; next_node != NULL; next_node = next_node->next )
387 block2 = ( bitblock * ) next_node->val;
389 if ( ! block2->hasN )
391 motif_bin = blocks2motif( block1->bin, block2->bin, dist );
393 // bitblock_list_print( list ); /* DEBUG */
395 count_array[ motif_bin ]++;
404 void rescan_list( list_sl *list, uint *count_array, size_t pos, size_t cutoff )
406 /* Martin A. Hansen, September 2008 */
408 /* Scan a list of blocks for biparite motifs by creating */
409 /* a binary motif consisting of two blocks of 4 nucleotides */
410 /* along with the distance separating them. Motifs containing */
411 /* N's are skipped. */
413 node_sl *first_node = NULL;
414 node_sl *next_node = NULL;
415 bitblock *block1 = NULL;
416 bitblock *block2 = NULL;
423 // bitblock_list_print( list );
425 first_node = list->first;
427 block1 = ( bitblock * ) first_node->val;
429 if ( ! block1->hasN )
431 next_node = first_node->next;
433 for ( i = 0; i < BLOCK_SIZE_NT - 1; i++ )
435 next_node = next_node->next;
440 for ( next_node = next_node; next_node != NULL; next_node = next_node->next )
442 block2 = ( bitblock * ) next_node->val;
444 if ( ! block2->hasN )
446 motif_bin = blocks2motif( block1->bin, block2->bin, dist );
448 // bitblock_list_print( list ); /* DEBUG */
450 count = count_array[ motif_bin ];
452 if ( count > cutoff ) {
453 printf( "%zu\t%u\t%u\n", pos + j, motif_bin, count );
465 bitblock *bitblock_new()
467 /* Martin A. Hansen, September 2008 */
469 /* Initializes a new empty bitblock. */
471 bitblock *new_block = NULL;
473 new_block = mem_get( sizeof( bitblock ) );
476 new_block->hasN = FALSE;
482 uint blocks2motif( uchar bin1, uchar bin2, ushort dist )
484 /* Martin A. Hansen, September 2008 */
486 /* Given two binary encoded tetra nuceotide blocks, */
487 /* and the distance separating this, create a binary */
488 /* bipartite motif. */
494 motif <<= sizeof( uchar ) * BITS_IN_BYTE;
498 motif <<= sizeof( uchar ) * BITS_IN_BYTE;
506 void count_array_print( uint *count_array, size_t nmemb, size_t cutoff )
508 /* Martin A. Hansen, Seqptember 2008. */
510 /* Print all bipartite motifs in count_array as */
511 /* tabular output. */
517 for ( i = 0; i < nmemb; i++ )
520 count = count_array[ i ];
522 if ( count >= cutoff ) {
523 printf( "%u\t%u\n", motif, count );
529 /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> UNIT TESTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
534 fprintf( stderr, "Running tests\n" );
536 test_count_array_new();
538 test_bitblock_print();
539 test_bitblock_list_print();
543 fprintf( stderr, "All tests OK\n" );
547 void test_count_array_new()
549 fprintf( stderr, " Running test_count_array_new ... " );
555 array = count_array_new( nmemb );
557 for ( i = 0; i < nmemb; i++ ) {
558 assert( array[ i ] == 0 );
563 fprintf( stderr, "done.\n" );
567 void test_bitblock_new()
569 fprintf( stderr, " Running test_bitblock_new ... " );
571 bitblock *new_block = bitblock_new();
573 assert( new_block->bin == 0 );
574 assert( new_block->hasN == FALSE );
576 fprintf( stderr, "done.\n" );
580 void test_bitblock_print()
582 fprintf( stderr, " Running test_bitblock_print ... " );
584 bitblock *new_block = bitblock_new();
587 new_block->hasN = TRUE;
589 // bitblock_print( new_block );
591 fprintf( stderr, "done.\n");
595 void test_bitblock_list_print()
597 fprintf( stderr, " Running test_bitblock_list_print ... " );
599 list_sl *list = list_sl_new();
600 node_sl *node1 = node_sl_new();
601 node_sl *node2 = node_sl_new();
602 node_sl *node3 = node_sl_new();
603 bitblock *block1 = bitblock_new();
604 bitblock *block2 = bitblock_new();
605 bitblock *block3 = bitblock_new();
618 list_sl_add_beg( &list, &node1 );
619 list_sl_add_beg( &list, &node2 );
620 list_sl_add_beg( &list, &node3 );
622 // bitblock_list_print( list );
624 fprintf( stderr, "done.\n" );
630 fprintf( stderr, " Running test_scan_seq ... " );
632 //char *seq = "AAAANTCGGCTNGGGG";
633 //char *seq = "AAAATCGGCTGGGG";
634 char *seq = "AAAAAAAAAAAAAAG";
635 size_t seq_len = strlen( seq );
636 size_t nmemb = 1 << 5;
638 uint *count_array = count_array_new( nmemb );
640 scan_seq( seq, seq_len, count_array );
642 // count_array_print( count_array, nmemb, 1 ); /* DEBUG */
644 fprintf( stderr, "done.\n" );
648 static void test_blocks2motif()
650 fprintf( stderr, " Running test_blocks2motif ... " );
657 motif = blocks2motif( bin1, bin2, dist );
659 // printf( "motif: %d\n", motif );
661 fprintf( stderr, "done.\n");
665 /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */