1 /* Martin Asser Hansen (mail@maasha.dk) Copyright (C) 2008 - All right reserved */
10 #define BLOCK_SIZE_NT 4 /* one block holds 4 nucleotides. */
11 #define BITS_IN_NT 2 /* two bits holds 1 nucleotide. */
12 #define BITS_IN_BYTE 8 /* number of bits in one byte. */
13 #define BLOCK_SPACE_MAX 64 /* maximum space between two blocks. */
14 #define BLOCK_MASK ( ( BLOCK_SPACE_MAX << 1 ) - 1 ) /* mask for printing block space. */
15 #define COUNT_ARRAY_NMEMB ( 1 << 30 ) /* number of objects in the unsigned int count array. */
16 #define CUTOFF 1 /* minimum number of motifs in output. */
18 /* Structure that will hold one tetra nucleotide block. */
21 uchar bin; /* Tetra nucleotide binary encoded. */
22 bool hasN; /* Flag indicating any N's in the block. */
25 typedef struct _bitblock bitblock;
27 /* Function declarations. */
28 void run_scan( int argc, char *argv[] );
30 void scan_file( char *file, seq_entry *entry, uint *count_array );
31 void rescan_file( char *file, seq_entry *entry, uint *count_array, size_t cutoff );
32 uint *count_array_new( size_t nmemb );
33 void scan_seq( char *seq, size_t seq_len, uint *count_array );
34 void rescan_seq( char *seq, size_t seq_len, uint *count_array, size_t cutoff );
35 void scan_list( list_sl *list, uint *count_array );
36 void rescan_list( list_sl *list, uint *count_array, size_t pos, size_t cutoff, uint *output_array );
37 bitblock *bitblock_new();
38 uint blocks2motif( uchar bin1, uchar bin2, ushort dist );
39 void count_array_print( uint *count_array, size_t nmemb, size_t cutoff );
40 void bitblock_list_print( list_sl *list );
41 void bitblock_print( bitblock *out );
43 /* Unit test declarations. */
44 static void run_tests();
45 static void test_count_array_new();
46 static void test_bitblock_new();
47 static void test_bitblock_print();
48 static void test_bitblock_list_print();
49 static void test_scan_seq();
50 static void test_blocks2motif();
53 /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MAIN <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
56 int main( int argc, char *argv[] )
64 run_scan( argc, argv );
70 /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FUNCTIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
75 /* Martin A. Hansen, September 2008 */
77 /* Print usage and exit if no files in argument. */
79 fprintf( stderr, "Usage: bipartite_scam <FASTA file(s)> > result.csv\n" );
85 void run_scan( int argc, char *argv[] )
87 /* Martin A. Hansen, September 2008 */
89 /* For each file in argv scan the file for */
90 /* bipartite motifs and output the motifs */
91 /* and their count. */
95 seq_entry *entry = NULL;
96 uint *count_array = NULL;
97 // size_t new_nmemb = 0;
99 count_array = count_array_new( COUNT_ARRAY_NMEMB );
101 entry = seq_new( MAX_SEQ_NAME, MAX_SEQ );
103 for ( i = 1; i < argc; i++ )
107 fprintf( stderr, "Scanning file: %s\n", file );
108 scan_file( file, entry, count_array );
109 fprintf( stderr, "done.\n" );
112 // fprintf( stderr, "Printing motifs: ... " );
113 // count_array_print( count_array, COUNT_ARRAY_NMEMB, CUTOFF );
114 // fprintf( stderr, "done.\n" );
118 fprintf( stderr, "Rescanning file: %s\n", file );
119 rescan_file( file, entry, count_array, CUTOFF );
120 fprintf( stderr, "done.\n" );
122 seq_destroy( entry );
124 mem_free( &count_array );
128 uint *count_array_new( size_t nmemb )
130 /* Martin A. Hansen, September 2008 */
132 /* Initialize a new zeroed uint array of nmemb objects. */
138 array = mem_get_zero( nmemb * sizeof( uint ) );
144 void scan_file( char *file, seq_entry *entry, uint *count_array )
146 /* Martin A. Hansen, September 2008 */
148 /* Scan all FASTA entries of a file. */
150 FILE *fp = read_open( file );
152 while ( fasta_get_entry( fp, &entry ) == TRUE )
154 fprintf( stderr, " Scanning: %s (%zu nt) ... ", entry->seq_name, entry->seq_len );
156 scan_seq( entry->seq, entry->seq_len, count_array );
158 fprintf( stderr, "done.\n" );
165 void rescan_file( char *file, seq_entry *entry, uint *count_array, size_t cutoff )
167 /* Martin A. Hansen, September 2008 */
169 /* Rescan all FASTA entries of a file. */
171 FILE *fp = read_open( file );
173 while ( fasta_get_entry( fp, &entry ) == TRUE )
175 fprintf( stderr, " Rescanning: %s (%zu nt) ... ", entry->seq_name, entry->seq_len );
177 printf( "SEQ_NAME: %s\n", entry->seq_name );
179 rescan_seq( entry->seq, entry->seq_len, count_array, cutoff );
181 fprintf( stderr, "done.\n" );
188 void scan_seq( char *seq, size_t seq_len, uint *count_array )
190 /* Martin A. Hansen, September 2008 */
192 /* Run a sliding window over a given sequence. The window */
193 /* consists of a list where new blocks of 4 nucleotides */
194 /* are pushed onto one end while at the same time old */
195 /* blocks are popped from the other end. The number of */
196 /* in the list is determined by the maximum seperator. */
197 /* Everytime we have a full window, the window is scanned */
200 bitblock *block = NULL;
205 bool first_node = TRUE;
206 node_sl *new_node = NULL;
207 node_sl *old_node = NULL;
208 list_sl *list = list_sl_new();
210 for ( i = 0; seq[ i ]; i++ )
216 case 'A': case 'a': add_A( bin ); break;
217 case 'T': case 't': add_T( bin ); break;
218 case 'C': case 'c': add_C( bin ); break;
219 case 'G': case 'g': add_G( bin ); break;
220 default: n_count = BLOCK_SIZE_NT; break;
223 if ( i > BLOCK_SIZE_NT - 2 )
227 block = bitblock_new();
236 new_node = node_sl_new();
237 new_node->val = block;
240 list_sl_add_beg( &list, &new_node );
242 list_sl_add_after( &old_node, &new_node );
249 if ( b_count > BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
251 // bitblock_list_print( list ); /* DEBUG */
253 scan_list( list, count_array );
255 mem_free( &list->first->val );
257 list_sl_remove_beg( &list );
262 /* if the list is shorter than BLOCK_SPACE_MAX + BLOCK_SIZE_NT */
263 if ( b_count <= BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
265 // bitblock_list_print( list ); /* DEBUG */
267 scan_list( list, count_array );
270 list_sl_destroy( &list );
274 void rescan_seq( char *seq, size_t seq_len, uint *count_array, size_t cutoff )
276 /* Martin A. Hansen, September 2008 */
278 /* Run a sliding window over a given sequence. The window */
279 /* consists of a list where new blocks of 4 nucleotides */
280 /* are pushed onto one end while at the same time old */
281 /* blocks are popped from the other end. The number of */
282 /* in the list is determined by the maximum seperator. */
283 /* Everytime we have a full window, the window is scanned */
286 bitblock *block = NULL;
291 bool first_node = TRUE;
292 node_sl *new_node = NULL;
293 node_sl *old_node = NULL;
294 list_sl *list = list_sl_new();
295 uint *output_array = NULL;
297 output_array = mem_get_zero( sizeof( uint ) * ( seq_len + 1 ) );
299 for ( i = 0; seq[ i ]; i++ )
305 case 'A': case 'a': add_A( bin ); break;
306 case 'T': case 't': add_T( bin ); break;
307 case 'C': case 'c': add_C( bin ); break;
308 case 'G': case 'g': add_G( bin ); break;
309 default: n_count = BLOCK_SIZE_NT; break;
312 if ( i > BLOCK_SIZE_NT - 2 )
316 block = bitblock_new();
325 new_node = node_sl_new();
326 new_node->val = block;
329 list_sl_add_beg( &list, &new_node );
331 list_sl_add_after( &old_node, &new_node );
338 if ( b_count > BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
340 // bitblock_list_print( list ); /* DEBUG */
342 rescan_list( list, count_array, i, cutoff, output_array );
344 mem_free( &list->first->val );
346 list_sl_remove_beg( &list );
351 /* if the list is shorter than BLOCK_SPACE_MAX + BLOCK_SIZE_NT */
352 if ( b_count <= BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
354 // bitblock_list_print( list ); /* DEBUG */
356 rescan_list( list, count_array, i, cutoff, output_array );
359 list_sl_destroy( &list );
361 for ( i = 0; i < seq_len; i++ ) {
362 printf( "%zu\t%u\n", i, output_array[ i ] );
365 free( output_array );
369 void scan_list( list_sl *list, uint *count_array )
371 /* Martin A. Hansen, September 2008 */
373 /* Scan a list of blocks for biparite motifs by creating */
374 /* a binary motif consisting of two blocks of 4 nucleotides */
375 /* along with the distance separating them. Motifs containing */
376 /* N's are skipped. */
378 node_sl *first_node = NULL;
379 node_sl *next_node = NULL;
380 bitblock *block1 = NULL;
381 bitblock *block2 = NULL;
386 // bitblock_list_print( list );
388 first_node = list->first;
390 block1 = ( bitblock * ) first_node->val;
392 if ( ! block1->hasN )
394 next_node = first_node->next;
396 for ( i = 0; i < BLOCK_SIZE_NT - 1; i++ ) {
397 next_node = next_node->next;
400 for ( next_node = next_node; next_node != NULL; next_node = next_node->next )
402 block2 = ( bitblock * ) next_node->val;
404 if ( ! block2->hasN )
406 motif_bin = blocks2motif( block1->bin, block2->bin, dist );
408 // bitblock_list_print( list ); /* DEBUG */
410 count_array[ motif_bin ]++;
419 void rescan_list( list_sl *list, uint *count_array, size_t pos, size_t cutoff, uint *output_array )
421 /* Martin A. Hansen, September 2008 */
423 /* Scan a list of blocks for biparite motifs by creating */
424 /* a binary motif consisting of two blocks of 4 nucleotides */
425 /* along with the distance separating them. Motifs containing */
426 /* N's are skipped. */
428 node_sl *first_node = NULL;
429 node_sl *next_node = NULL;
430 bitblock *block1 = NULL;
431 bitblock *block2 = NULL;
439 // bitblock_list_print( list );
441 first_node = list->first;
443 block1 = ( bitblock * ) first_node->val;
445 if ( ! block1->hasN )
447 next_node = first_node->next;
449 for ( i = 0; i < BLOCK_SIZE_NT - 1; i++ )
451 next_node = next_node->next;
456 for ( next_node = next_node; next_node != NULL; next_node = next_node->next )
458 block2 = ( bitblock * ) next_node->val;
460 if ( ! block2->hasN )
462 motif_bin = blocks2motif( block1->bin, block2->bin, dist );
464 // bitblock_list_print( list ); /* DEBUG */
466 count = count_array[ motif_bin ];
468 if ( count > cutoff )
470 // printf( "%zu\t%u\t%u\n", pos + j, motif_bin, count );
472 for ( k = 0; k < BLOCK_SIZE_NT - 1; k++ )
474 output_array[ pos - j + k - BLOCK_SIZE_NT ] += count;
475 output_array[ pos + k - dist - BLOCK_SIZE_NT ] += count;
488 bitblock *bitblock_new()
490 /* Martin A. Hansen, September 2008 */
492 /* Initializes a new empty bitblock. */
494 bitblock *new_block = NULL;
496 new_block = mem_get( sizeof( bitblock ) );
499 new_block->hasN = FALSE;
505 uint blocks2motif( uchar bin1, uchar bin2, ushort dist )
507 /* Martin A. Hansen, September 2008 */
509 /* Given two binary encoded tetra nuceotide blocks, */
510 /* and the distance separating this, create a binary */
511 /* bipartite motif. */
517 motif <<= sizeof( uchar ) * BITS_IN_BYTE;
521 motif <<= sizeof( uchar ) * BITS_IN_BYTE;
529 void count_array_print( uint *count_array, size_t nmemb, size_t cutoff )
531 /* Martin A. Hansen, Seqptember 2008. */
533 /* Print all bipartite motifs in count_array as */
534 /* tabular output. */
540 for ( i = 0; i < nmemb; i++ )
543 count = count_array[ i ];
545 if ( count >= cutoff ) {
546 printf( "%u\t%u\n", motif, count );
552 /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> UNIT TESTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
557 fprintf( stderr, "Running tests\n" );
559 test_count_array_new();
561 test_bitblock_print();
562 test_bitblock_list_print();
566 fprintf( stderr, "All tests OK\n" );
570 void test_count_array_new()
572 fprintf( stderr, " Running test_count_array_new ... " );
578 array = count_array_new( nmemb );
580 for ( i = 0; i < nmemb; i++ ) {
581 assert( array[ i ] == 0 );
586 fprintf( stderr, "done.\n" );
590 void test_bitblock_new()
592 fprintf( stderr, " Running test_bitblock_new ... " );
594 bitblock *new_block = bitblock_new();
596 assert( new_block->bin == 0 );
597 assert( new_block->hasN == FALSE );
599 fprintf( stderr, "done.\n" );
603 void test_bitblock_print()
605 fprintf( stderr, " Running test_bitblock_print ... " );
607 bitblock *new_block = bitblock_new();
610 new_block->hasN = TRUE;
612 // bitblock_print( new_block );
614 fprintf( stderr, "done.\n");
618 void test_bitblock_list_print()
620 fprintf( stderr, " Running test_bitblock_list_print ... " );
622 list_sl *list = list_sl_new();
623 node_sl *node1 = node_sl_new();
624 node_sl *node2 = node_sl_new();
625 node_sl *node3 = node_sl_new();
626 bitblock *block1 = bitblock_new();
627 bitblock *block2 = bitblock_new();
628 bitblock *block3 = bitblock_new();
641 list_sl_add_beg( &list, &node1 );
642 list_sl_add_beg( &list, &node2 );
643 list_sl_add_beg( &list, &node3 );
645 // bitblock_list_print( list );
647 fprintf( stderr, "done.\n" );
653 fprintf( stderr, " Running test_scan_seq ... " );
655 //char *seq = "AAAANTCGGCTNGGGG";
656 //char *seq = "AAAATCGGCTGGGG";
657 char *seq = "AAAAAAAAAAAAAAG";
658 size_t seq_len = strlen( seq );
659 size_t nmemb = 1 << 5;
661 uint *count_array = count_array_new( nmemb );
663 scan_seq( seq, seq_len, count_array );
665 // count_array_print( count_array, nmemb, 1 ); /* DEBUG */
667 fprintf( stderr, "done.\n" );
671 static void test_blocks2motif()
673 fprintf( stderr, " Running test_blocks2motif ... " );
680 motif = blocks2motif( bin1, bin2, dist );
682 // printf( "motif: %d\n", motif );
684 fprintf( stderr, "done.\n");
688 /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */