8 #define BLOCK_SIZE_NT 4 /* one block holds 4 nucleotides. */
9 #define BITS_IN_NT 2 /* two bits holds 1 nucleotide. */
10 #define BITS_IN_BYTE 8 /* number of bits in one byte. */
11 #define BLOCK_SPACE_MAX 2 /* maximum space between two blocks. */
12 #define BLOCK_MASK ( BLOCK_SPACE_MAX - 1 ) /* mask for printing block space. */
13 #define COUNT_ARRAY_NMEMB ( 1 << 30 ) /* number of objects in the unsigned int count array. */
14 #define CUTOFF 1 /* minimum number of motifs in output. */
16 #define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */
17 #define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */
18 #define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */
19 #define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */
21 /* Structure that will hold one tetra nucleotide block. */
24 uchar bin; /* Tetra nucleotide binary encoded. */
25 bool hasN; /* Flag indicating any N's in the block. */
28 typedef struct _bitblock bitblock;
30 /* Byte array for fast convertion of binary blocks back to DNA. */
31 char *bin2dna[256] = {
32 "AAAA", "AAAC", "AAAG", "AAAT", "AACA", "AACC", "AACG", "AACT",
33 "AAGA", "AAGC", "AAGG", "AAGT", "AATA", "AATC", "AATG", "AATT",
34 "ACAA", "ACAC", "ACAG", "ACAT", "ACCA", "ACCC", "ACCG", "ACCT",
35 "ACGA", "ACGC", "ACGG", "ACGT", "ACTA", "ACTC", "ACTG", "ACTT",
36 "AGAA", "AGAC", "AGAG", "AGAT", "AGCA", "AGCC", "AGCG", "AGCT",
37 "AGGA", "AGGC", "AGGG", "AGGT", "AGTA", "AGTC", "AGTG", "AGTT",
38 "ATAA", "ATAC", "ATAG", "ATAT", "ATCA", "ATCC", "ATCG", "ATCT",
39 "ATGA", "ATGC", "ATGG", "ATGT", "ATTA", "ATTC", "ATTG", "ATTT",
40 "CAAA", "CAAC", "CAAG", "CAAT", "CACA", "CACC", "CACG", "CACT",
41 "CAGA", "CAGC", "CAGG", "CAGT", "CATA", "CATC", "CATG", "CATT",
42 "CCAA", "CCAC", "CCAG", "CCAT", "CCCA", "CCCC", "CCCG", "CCCT",
43 "CCGA", "CCGC", "CCGG", "CCGT", "CCTA", "CCTC", "CCTG", "CCTT",
44 "CGAA", "CGAC", "CGAG", "CGAT", "CGCA", "CGCC", "CGCG", "CGCT",
45 "CGGA", "CGGC", "CGGG", "CGGT", "CGTA", "CGTC", "CGTG", "CGTT",
46 "CTAA", "CTAC", "CTAG", "CTAT", "CTCA", "CTCC", "CTCG", "CTCT",
47 "CTGA", "CTGC", "CTGG", "CTGT", "CTTA", "CTTC", "CTTG", "CTTT",
48 "GAAA", "GAAC", "GAAG", "GAAT", "GACA", "GACC", "GACG", "GACT",
49 "GAGA", "GAGC", "GAGG", "GAGT", "GATA", "GATC", "GATG", "GATT",
50 "GCAA", "GCAC", "GCAG", "GCAT", "GCCA", "GCCC", "GCCG", "GCCT",
51 "GCGA", "GCGC", "GCGG", "GCGT", "GCTA", "GCTC", "GCTG", "GCTT",
52 "GGAA", "GGAC", "GGAG", "GGAT", "GGCA", "GGCC", "GGCG", "GGCT",
53 "GGGA", "GGGC", "GGGG", "GGGT", "GGTA", "GGTC", "GGTG", "GGTT",
54 "GTAA", "GTAC", "GTAG", "GTAT", "GTCA", "GTCC", "GTCG", "GTCT",
55 "GTGA", "GTGC", "GTGG", "GTGT", "GTTA", "GTTC", "GTTG", "GTTT",
56 "TAAA", "TAAC", "TAAG", "TAAT", "TACA", "TACC", "TACG", "TACT",
57 "TAGA", "TAGC", "TAGG", "TAGT", "TATA", "TATC", "TATG", "TATT",
58 "TCAA", "TCAC", "TCAG", "TCAT", "TCCA", "TCCC", "TCCG", "TCCT",
59 "TCGA", "TCGC", "TCGG", "TCGT", "TCTA", "TCTC", "TCTG", "TCTT",
60 "TGAA", "TGAC", "TGAG", "TGAT", "TGCA", "TGCC", "TGCG", "TGCT",
61 "TGGA", "TGGC", "TGGG", "TGGT", "TGTA", "TGTC", "TGTG", "TGTT",
62 "TTAA", "TTAC", "TTAG", "TTAT", "TTCA", "TTCC", "TTCG", "TTCT",
63 "TTGA", "TTGC", "TTGG", "TTGT", "TTTA", "TTTC", "TTTG", "TTTT"
66 /* Function declarations. */
67 void run_scan( int argc, char *argv[] );
69 void scan_file( char *file, seq_entry *entry, uint *count_array );
70 uint *count_array_new( size_t nmemb );
71 void scan_seq( char *seq, size_t seq_len, uint *count_array );
72 void scan_list( list_sl *list, uint *count_array );
73 bitblock *bitblock_new();
74 uint blocks2motif( uchar bin1, uchar bin2, ushort dist );
75 size_t filter_results( uint **array_ppt, size_t nmemb, uint cutoff );
76 int cmp_uint_desc( const void *a, const void *b );
77 void count_array_print( uint *count_array, size_t nmemb );
78 void motif_print( uint motif, uint count );
79 void bitblock_list_print( list_sl *list );
80 void bitblock_print( bitblock *out );
82 /* Unit test declarations. */
83 static void run_tests();
84 static void test_count_array_new();
85 static void test_bitblock_new();
86 static void test_bitblock_print();
87 static void test_bitblock_list_print();
88 static void test_scan_seq();
89 static void test_filter_results();
90 static void test_array_sort();
91 static void test_blocks2motif();
94 /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MAIN <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
97 int main( int argc, char *argv[] )
105 run_scan( argc, argv );
111 /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FUNCTIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
116 /* Martin A. Hansen, September 2008 */
118 /* Print usage and exit if no files in argument. */
121 "Usage: bipartite_scam <FASTA file(s)> > result.csv\n"
124 exit( EXIT_SUCCESS );
128 void run_scan( int argc, char *argv[] )
130 /* Martin A. Hansen, September 2008 */
132 /* For each file in argv scan the file for */
133 /* bipartite motifs and output the motifs */
134 /* and their count. */
138 seq_entry *entry = NULL;
139 uint *count_array = NULL;
140 size_t new_nmemb = 0;
142 count_array = count_array_new( COUNT_ARRAY_NMEMB );
144 entry = seq_new( MAX_SEQ_NAME, MAX_SEQ );
146 for ( i = 1; i < argc; i++ )
150 fprintf( stderr, "Scanning file: %s\n", file );
151 scan_file( file, entry, count_array );
152 fprintf( stderr, "done.\n" );
155 fprintf( stderr, "Filtering results (cutoff=%d) ... ", CUTOFF );
156 new_nmemb = filter_results( &count_array, COUNT_ARRAY_NMEMB, CUTOFF );
157 fprintf( stderr, "done.\n" );
159 fprintf( stderr, "Sorting motifs: ... " );
160 qsort( count_array, new_nmemb, sizeof( uint ), cmp_uint_desc );
161 fprintf( stderr, "done.\n" );
163 fprintf( stderr, "Printing motifs: ... " );
164 count_array_print( count_array, new_nmemb );
165 fprintf( stderr, "done.\n" );
167 seq_destroy( entry );
169 mem_free( &count_array );
173 uint *count_array_new( size_t nmemb )
175 /* Martin A. Hansen, September 2008 */
177 /* Initialize a new zeroed uint array of nmemb objects. */
183 array = mem_get_zero( nmemb * sizeof( uint ) );
189 void scan_file( char *file, seq_entry *entry, uint *count_array )
191 /* Martin A. Hansen, September 2008 */
193 /* Scan all FASTA entries of a file in both */
194 /* sense and anti-sense directions. */
196 FILE *fp = read_open( file );
198 while ( fasta_get_entry( fp, &entry ) == TRUE )
200 fprintf( stderr, " Scanning: %s (%zu nt) ... ", entry->seq_name, entry->seq_len );
202 scan_seq( entry->seq, entry->seq_len, count_array );
204 fprintf( stderr, "done.\n" );
211 void scan_seq( char *seq, size_t seq_len, uint *count_array )
213 /* Martin A. Hansen, September 2008 */
215 /* Run a sliding window over a given sequence. The window */
216 /* consists of a list where new blocks of 4 nucleotides */
217 /* are pushed onto one end while at the same time old */
218 /* blocks are popped from the other end. The number of */
219 /* in the list is determined by the maximum seperator. */
220 /* Everytime we have a full window, the window is scanned */
223 bitblock *block = NULL;
228 bool first_node = TRUE;
229 node_sl *new_node = NULL;
230 node_sl *old_node = NULL;
231 list_sl *list = list_sl_new();
233 for ( i = 0; seq[ i ]; i++ )
239 case 'A': case 'a': add_A( bin ); break;
240 case 'T': case 't': add_T( bin ); break;
241 case 'C': case 'c': add_C( bin ); break;
242 case 'G': case 'g': add_G( bin ); break;
243 default: n_count = BLOCK_SIZE_NT; break;
246 if ( i > BLOCK_SIZE_NT - 2 )
250 block = bitblock_new();
259 new_node = node_sl_new();
260 new_node->val = block;
263 list_sl_add_beg( &list, &new_node );
265 list_sl_add_after( &old_node, &new_node );
272 if ( b_count > BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
274 // bitblock_list_print( list ); /* DEBUG */
276 scan_list( list, count_array );
278 list_sl_remove_beg( &list );
283 /* if the list is shorter than BLOCK_SPACE_MAX + BLOCK_SIZE_NT */
284 if ( b_count <= BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
286 // bitblock_list_print( list ); /* DEBUG */
288 scan_list( list, count_array );
291 list_sl_destroy( &list );
295 void scan_list( list_sl *list, uint *count_array )
297 /* Martin A. Hansen, September 2008 */
299 /* Scan a list of blocks for biparite motifs by creating */
300 /* a binary motif consisting of two blocks of 4 nucleotides */
301 /* along with the distance separating them. Motifs containing */
302 /* N's are skipped. */
304 node_sl *first_node = NULL;
305 node_sl *next_node = NULL;
306 bitblock *block1 = NULL;
307 bitblock *block2 = NULL;
312 // bitblock_list_print( list );
314 first_node = list->first;
316 block1 = ( bitblock * ) first_node->val;
318 if ( ! block1->hasN )
320 next_node = first_node->next;
322 for ( i = 0; i < BLOCK_SIZE_NT - 1; i++ ) {
323 next_node = next_node->next;
326 for ( next_node = next_node; next_node != NULL; next_node = next_node->next )
328 block2 = ( bitblock * ) next_node->val;
330 // printf( "block1: %s block2: %s dist: %d\n", bin2dna[ block1->bin ], bin2dna[ block2->bin ], dist ); /* DEBUG */
332 if ( ! block2->hasN )
334 motif_bin = blocks2motif( block1->bin, block2->bin, dist );
336 // motif_print( motif_bin, 0 ); /* DEBUG */
337 // bitblock_list_print( list ); /* DEBUG */
339 count_array[ motif_bin ]++;
348 bitblock *bitblock_new()
350 /* Martin A. Hansen, September 2008 */
352 /* Initializes a new empty bitblock. */
354 bitblock *new_block = NULL;
356 new_block = mem_get( sizeof( bitblock ) );
359 new_block->hasN = FALSE;
365 uint blocks2motif( uchar bin1, uchar bin2, ushort dist )
367 /* Martin A. Hansen, September 2008 */
369 /* Given two binary encoded tetra nuceotide blocks, */
370 /* and the distance separating this, create a binary */
371 /* bipartite motif. */
377 motif <<= sizeof( uchar ) * BITS_IN_BYTE;
381 motif <<= sizeof( uchar ) * BITS_IN_BYTE;
389 size_t filter_results( uint **array_ppt, size_t nmemb, uint cutoff )
391 /* Martin A. Hansen, September 2008 */
393 /* Iterates an array of bipartite scan results */
394 /* determining how many elements are above the cutoff, */
395 /* then allocate memory for a new array, copy the */
396 /* values that were above the cutoff to the new array. */
397 /* The number of elements on the new array is returned, */
398 /* and the array is replaced. */
400 uint *array = ( uint * ) *array_ppt;
401 uint *new_array = NULL;
404 size_t new_nmemb = 0;
406 for ( i = 0; i < nmemb; i++ )
408 if ( array[ i ] >= cutoff ) {
413 assert( new_nmemb > 0 );
415 new_array = count_array_new( new_nmemb );
417 for ( i = 0; i < nmemb; i++ )
419 if ( array[ i ] >= cutoff )
421 new_array[ j ] = array[ i ];
427 *array_ppt = new_array;
433 int cmp_uint_desc( const void *a, const void *b )
435 /* Martin A. Hansen, September 2008 */
437 /* Compare function for qsort of an array of unsigned ints */
438 /* to be sorted in descending order. */
440 const uint *uint_a = ( const uint *) a;
441 const uint *uint_b = ( const uint *) b;
443 return *uint_b - *uint_a;
447 void count_array_print( uint *count_array, size_t nmemb )
449 /* Martin A. Hansen, Seqptember 2008. */
451 /* Print all bipartite motifs in count_array as */
452 /* tabular output. */
458 for ( i = 0; i < nmemb; i++ )
461 count = count_array[ i ];
463 motif_print( motif, count );
468 void motif_print( uint motif, uint count )
470 /* Martin A. Hansen, September 2008 */
472 /* Converts a binary encoded bipartite motif */
473 /* into DNA and output the motif, distance and */
474 /* count seperated by tabs: */
475 /* BLOCK1 \t BLOCK2 \t DIST \t COUNT */
481 dist = ( ushort ) motif & BLOCK_MASK;
483 motif >>= sizeof( uchar ) * BITS_IN_BYTE;
485 bin2 = ( uchar ) motif;
487 motif >>= sizeof( uchar ) * BITS_IN_BYTE;
489 bin1 = ( uchar ) motif;
491 printf( "%s\t%s\t%d\t%d\n", bin2dna[ bin1 ], bin2dna[ bin2 ], dist, count );
495 void bitblock_list_print( list_sl *list )
497 /* Martin A. Hansen, September 2008 */
499 /* Debug function to print all blocks in a list. */
501 node_sl *node = NULL;
503 printf( "\nbitblock_list_print:\n" );
505 for ( node = list->first; node != NULL; node = node->next ) {
506 bitblock_print( ( bitblock * ) node->val );
511 void bitblock_print( bitblock *out )
513 /* Martin A. Hansen, September 2008 */
515 /* Debug function to print a given block. */
517 printf( "bin: %d dna: %s hasN: %d\n", out->bin, bin2dna[ ( int ) out->bin ], out->hasN );
521 /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> UNIT TESTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
526 fprintf( stderr, "Running tests\n" );
528 test_count_array_new();
530 test_bitblock_print();
531 test_bitblock_list_print();
533 test_filter_results();
537 fprintf( stderr, "All tests OK\n" );
541 void test_count_array_new()
543 fprintf( stderr, " Running test_count_array_new ... " );
549 array = count_array_new( nmemb );
551 for ( i = 0; i < nmemb; i++ ) {
552 assert( array[ i ] == 0 );
557 fprintf( stderr, "done.\n" );
561 void test_bitblock_new()
563 fprintf( stderr, " Running test_bitblock_new ... " );
565 bitblock *new_block = bitblock_new();
567 assert( new_block->bin == 0 );
568 assert( new_block->hasN == FALSE );
570 fprintf( stderr, "done.\n" );
574 void test_bitblock_print()
576 fprintf( stderr, " Running test_bitblock_print ... " );
578 bitblock *new_block = bitblock_new();
581 new_block->hasN = TRUE;
583 // bitblock_print( new_block );
585 fprintf( stderr, "done.\n");
589 void test_bitblock_list_print()
591 fprintf( stderr, " Running test_bitblock_list_print ... " );
593 list_sl *list = list_sl_new();
594 node_sl *node1 = node_sl_new();
595 node_sl *node2 = node_sl_new();
596 node_sl *node3 = node_sl_new();
597 bitblock *block1 = bitblock_new();
598 bitblock *block2 = bitblock_new();
599 bitblock *block3 = bitblock_new();
612 list_sl_add_beg( &list, &node1 );
613 list_sl_add_beg( &list, &node2 );
614 list_sl_add_beg( &list, &node3 );
616 // bitblock_list_print( list );
618 fprintf( stderr, "done.\n" );
624 fprintf( stderr, " Running test_scan_seq ... " );
626 //char *seq = "AAAANTCGGCTNGGGG";
627 //char *seq = "AAAATCGGCTGGGG";
628 char *seq = "AAAAAAAAAAAAAAG";
629 size_t seq_len = strlen( seq );
630 size_t nmemb = 1 << 5;
632 uint *count_array = count_array_new( nmemb );
634 scan_seq( seq, seq_len, count_array );
636 count_array_print( count_array, nmemb ); /* DEBUG */
638 fprintf( stderr, "done.\n" );
642 void test_filter_results()
644 fprintf( stderr, " Running test_filter_results ... " );
646 uint array[] = { 4, 234, 23, 43, 12, 23, 1, 34 };
647 uint *array_pt = array;
648 size_t array_nmemb = sizeof( array ) / sizeof( uint );
653 size = filter_results( &array_pt, array_nmemb, cutoff );
655 assert( size == 5 ); /* 5 elements greather than 20. */
657 for ( i = 0; i < size; i ++ ) {
658 // printf( "elem: %i\n", array_pt[ i ] );
661 fprintf( stderr, "done.\n" );
665 void test_array_sort()
667 fprintf( stderr, " Running test_array_sort ... " );
669 uint array[] = { 4, 234, 23, 43, 12, 23, 1, 34 };
670 int array_size = sizeof( array ) / sizeof( uint );
673 for ( i = 0; i < array_size; i ++ ) {
674 // printf( "elem: %i\n", array[ i ] );
677 qsort( array, array_size, sizeof( uint ), cmp_uint_desc );
679 for ( i = 0; i < array_size; i ++ ) {
680 // printf( "elem: %i\n", array[ i ] );
683 assert( array[ 0 ] == 234 ); /* 234 first in sorted array */
684 assert( array[ array_size - 1 ] == 1 ); /* 1 last in sorted array */
686 fprintf( stderr, "done.\n" );
690 static void test_blocks2motif()
692 fprintf( stderr, " Running test_blocks2motif ... " );
699 motif = blocks2motif( bin1, bin2, dist );
701 // printf( "motif: %d\n", motif );
703 fprintf( stderr, "done.\n");
707 /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */