#include "common.h"
#include "mem.h"
+#include "filesys.h"
+#include "seq.h"
+#include "fasta.h"
#include "list.h"
-#define BLOCK_SIZE_BITS 8 /* one block holds 8 bits */
-#define BLOCK_SIZE_NT 4 /* one block holds 4 nucleotides */
-#define BITS_IN_NT 2 /* two bits holds 1 nucleotide */
+#define BLOCK_SIZE_NT 4 /* one block holds 4 nucleotides. */
+#define BITS_IN_NT 2 /* two bits holds 1 nucleotide. */
+#define BITS_IN_BYTE 8 /* number of bits in one byte. */
+#define BLOCK_SPACE_MAX 256 /* maximum space between two blocks. */
+// #define COUNT_ARRAY_SIZE ( 1 << 30 ) /* size of the unsigned int count array. */
+#define COUNT_ARRAY_SIZE ( 1 << 5 ) /* size of the unsigned int count array. */
-#define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */
-#define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */
-#define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */
-#define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */
+#define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */
+#define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */
+#define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */
+#define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */
-typedef unsigned char block;
+struct _bitblock
+{
+ uchar bin;
+ bool hasN;
+};
+
+typedef struct _bitblock bitblock;
-char *block2dna[256] = {
+char *bin2dna[256] = {
"AAAA", "AAAC", "AAAG", "AAAT", "AACA", "AACC", "AACG", "AACT",
"AAGA", "AAGC", "AAGG", "AAGT", "AATA", "AATC", "AATG", "AATT",
"ACAA", "ACAC", "ACAG", "ACAT", "ACCA", "ACCC", "ACCG", "ACCT",
"TTGA", "TTGC", "TTGG", "TTGT", "TTTA", "TTTC", "TTTG", "TTTT"
};
-bool block_fill( char *seq, size_t offset, block *new_block );
-void block_print( list_sl *list );
-void scan_seq( char *seq, size_t seq_len );
+void run_scan( int argc, char *argv[] );
+void print_usage();
+void scan_file( char *file, seq_entry *entry, uint *count_array );
+void scan_seq( char *seq, size_t seq_len, uint *count_array );
+void scan_list( list_sl *list, uint *count_array );
+bitblock *bitblock_new();
+void bitblock_print( bitblock *out );
+void bitblock_list_print( list_sl *list );
+uint blocks2motif( uchar bin1, uchar bin2, short dist );
+void count_array_print( uint *count_array );
+void motif_print( uint motif, uint count );
static void run_tests();
-static void test_block_fill();
+static void test_bitblock_new();
+static void test_bitblock_print();
+static void test_bitblock_list_print();
static void test_scan_seq();
+static void test_blocks2motif();
-int main()
+int main( int argc, char *argv[] )
{
+ if ( argc == 1 ) {
+ print_usage();
+ }
+
run_tests();
+ run_scan( argc, argv );
return EXIT_SUCCESS;
}
-bool block_fill( char *seq, size_t offset, block *new_block_pt )
+void print_usage()
{
- /* Martin A. Hansen, September 2008. */
+ fprintf( stderr,
+ "Usage: bipartite_scam <FASTA file(s)> > result.csv\n"
+ );
+
+ exit( EXIT_SUCCESS );
+}
+
+
+void run_scan( int argc, char *argv[] )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Scan a stack of files. */
+
+ char *file = NULL;
+ int i = 0;
+ seq_entry *entry = NULL;
+ uint *count_array = NULL;
+
+ count_array = mem_get_zero( COUNT_ARRAY_SIZE );
+
+ entry = seq_new( MAX_SEQ_NAME, MAX_SEQ );
+
+ for ( i = 1; i < argc; i++ )
+ {
+ file = argv[ i ];
+
+ fprintf( stderr, "Scanning file: %s\n", file );
+
+ scan_file( file, entry, count_array );
+
+ fprintf( stderr, "done.\n" );
+ }
+
+ fprintf( stderr, "Printing motifs: ... " );
+
+ count_array_print( count_array );
+
+ fprintf( stderr, "done.\n" );
+
+ seq_destroy( entry );
+
+ mem_free( &count_array );
+}
- /* Fills a block with sequence (converted to bits) from */
- /* a given offset. If the sequence contains non-DNA residues */
- /* the function returns FALSE. */
+
+void scan_file( char *file, seq_entry *entry, uint *count_array )
+{
+ /* Martin A. Hansen, September 2008 */
- block new_block = *new_block_pt;
- int i = 0;
+ /* Scan all entries of a file in both */
+ /* sense and anti-sense directions .*/
- for ( i = 0; i < BLOCK_SIZE_NT; i++ )
+ FILE *fp = read_open( file );
+
+ while ( fasta_get_entry( fp, &entry ) == TRUE )
{
- new_block <<= BITS_IN_NT;
+ fprintf( stderr, " + Scanning: %s ... ", entry->seq_name );
+
+ scan_seq( entry->seq, entry->seq_len, count_array );
- switch( seq[ offset + i ] )
- {
- case 'A': case 'a': add_A( new_block ); break;
- case 'T': case 't': add_T( new_block ); break;
- case 'C': case 'c': add_C( new_block ); break;
- case 'G': case 'g': add_G( new_block ); break;
- default: return FALSE;
- }
+ fprintf( stderr, "done.\n" );
+
+ revcomp_dna( entry->seq );
+
+ fprintf( stderr, " - Scanning: %s ... ", entry->seq_name );
+
+ scan_seq( entry->seq, entry->seq_len, count_array );
+
+ fprintf( stderr, "done.\n" );
}
- *new_block_pt = new_block;
+ close_stream( fp );
+}
+
+
+bitblock *bitblock_new()
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Initializes a new block. */
+
+ bitblock *new_block = NULL;
+
+ new_block = mem_get( sizeof( bitblock ) );
- return TRUE;
+ new_block->bin = 0;
+ new_block->hasN = FALSE;
+
+ return new_block;
}
-void scan_seq( char *seq, size_t seq_len )
+void bitblock_print( bitblock *out )
{
- /* Martin A. Hansen, September 2008. */
+ /* Martin A. Hansen, September 2008 */
+
+ /* Debug function to print a given block. */
+
+ printf( "bin: %d dna: %s hasN: %d\n", out->bin, bin2dna[ ( int ) out->bin ], out->hasN );
+}
- block b = 0;
- short b_count = 0;
- size_t i = 0;
- list_sl *list = NULL;
- node_sl *node_new = NULL;
- node_sl *node_old = NULL;
- bool first_node = TRUE;
- list_sl_new( &list );
+void bitblock_list_print( list_sl *list )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Debug function to print all blocks in a list. */
- node_new = mem_get( sizeof( node_sl ) );
- node_old = mem_get( sizeof( node_sl ) );
+ node_sl *node = NULL;
- while ( i < seq_len - BLOCK_SIZE_NT + 1 )
+ for ( node = list->first; node != NULL; node = node->next ) {
+ bitblock_print( ( bitblock * ) node->val );
+ }
+}
+
+
+void scan_seq( char *seq, size_t seq_len, uint *count_array )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ bitblock *block = NULL;
+ short b_count = 0;
+ short n_count = 0;
+ size_t i = 0;
+ uchar bin = 0;
+ bool first_node = TRUE;
+ node_sl *new_node = NULL;
+ node_sl *old_node = NULL;
+ list_sl *list = list_sl_new();
+
+ for ( i = 0; seq[ i ]; i++ )
{
- b <<= BITS_IN_NT;
+ bin <<= BITS_IN_NT;
switch( seq[ i ] )
{
- case 'A': case 'a': add_A( b ); b_count++; break;
- case 'T': case 't': add_T( b ); b_count++; break;
- case 'C': case 'c': add_C( b ); b_count++; break;
- case 'G': case 'g': add_G( b ); b_count++; break;
- default: b = 0; b_count = 0; break;
+ case 'A': case 'a': add_A( bin ); break;
+ case 'T': case 't': add_T( bin ); break;
+ case 'C': case 'c': add_C( bin ); break;
+ case 'G': case 'g': add_G( bin ); break;
+ default: n_count = BLOCK_SIZE_NT; break;
}
- if ( b_count >= BLOCK_SIZE_NT )
+ if ( i > BLOCK_SIZE_NT - 2 )
{
- node_new->val = "FISK";
+ b_count++;
+
+ block = bitblock_new();
+ block->bin = bin;
- if ( first_node )
+ if ( n_count > 0 )
{
- list_sl_add_beg( &list, &node_new );
+ block->hasN = TRUE;
+ n_count--;
}
- else
- {
- list_sl_add_after( &node_old, &node_new );
+
+ new_node = node_sl_new();
+ new_node->val = block;
+
+ if ( first_node ) {
+ list_sl_add_beg( &list, &new_node );
+ } else {
+ list_sl_add_after( &old_node, &new_node );
}
- node_old = node_new;
+ old_node = new_node;
first_node = FALSE;
+
+ if ( b_count > BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
+ {
+ // bitblock_list_print( list );
+
+ scan_list( list, count_array );
+
+ list_sl_remove_beg( &list );
+ }
}
+ }
+
+ list_sl_destroy( &list );
+}
+
- i++;
+void scan_list( list_sl *list, uint *count_array )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ node_sl *first_node = NULL;
+ node_sl *next_node = NULL;
+ bitblock *block1 = NULL;
+ bitblock *block2 = NULL;
+ int i = 0;
+ short dist = 0;
+ uint motif_bin = 0;
+
+// bitblock_list_print( list );
+
+ first_node = list->first;
+
+ block1 = ( bitblock * ) first_node->val;
+
+ if ( ! block1->hasN )
+ {
+ next_node = first_node->next;
+
+ for ( i = 0; i < BLOCK_SIZE_NT - 1; i++ ) {
+ next_node = next_node->next;
+ }
+
+ for ( next_node = next_node; next_node != NULL; next_node = next_node->next )
+ {
+ block2 = ( bitblock * ) next_node->val;
+
+// printf( "block1: %s block2: %s dist: %d\n", bin2dna[ block1->bin ], bin2dna[ block2->bin ], dist );
+
+ if ( ! block2->hasN )
+ {
+ motif_bin = blocks2motif( block1->bin, block2->bin, dist );
+
+ // motif_print( motif_bin );
+
+ count_array[ motif_bin ]++;
+ }
+
+ dist++;
+ }
}
+}
- block_print( list );
- list_sl_destroy( &list );
+uint blocks2motif( uchar bin1, uchar bin2, short dist )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ uint motif = 0;
+
+ motif |= bin1;
+
+ motif <<= sizeof( uchar ) * BITS_IN_BYTE;
+
+ motif |= bin2;
+
+ motif <<= sizeof( short ) * BITS_IN_BYTE;
+
+ motif |= dist;
+
+ return motif;
}
-void block_print( list_sl *list )
+void count_array_print( uint *count_array )
{
- node_sl *node = list->first;
+ /* Martin A. Hansen, Seqptember 2008. */
+
+ /* Print all motifs in count_array as */
+ /* tabular output. */
- while ( node != NULL )
+ uint i = 0;
+ uint motif = 0;
+ uint count = 0;
+
+ for ( i = 0; i < COUNT_ARRAY_SIZE; i++ )
{
- printf( "BLOCK: %s\n", ( char * ) node->val );
-
- node = node->next;
+ motif = i;
+ count = count_array[ i ];
+
+ motif_print( motif, count );
}
}
+void motif_print( uint motif, uint count )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ uchar bin1 = 0;
+ uchar bin2 = 0;
+ short dist = 0;
+
+ dist = ( short ) motif;
+
+ motif >>= sizeof( short ) * BITS_IN_BYTE;
+
+ bin2 = ( uchar ) motif;
+
+ motif >>= sizeof( uchar ) * BITS_IN_BYTE;
+
+ bin1 = ( uchar ) motif;
+
+ printf( "%s\t%s\t%d\t%d\n", bin2dna[ bin1 ], bin2dna[ bin2 ], dist, count );
+}
+
+
/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> UNIT TESTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
{
printf( "Running tests\n" );
- test_block_fill();
+ test_bitblock_new();
+ test_bitblock_print();
+ test_bitblock_list_print();
test_scan_seq();
+ test_blocks2motif();
printf( "All tests OK\n" );
}
-void test_block_fill()
+void test_bitblock_new()
{
- printf( " Running test_block_fill ... " );
+ printf( " Running test_bitblock_new ... " );
- char *seq = "AAAATCGGCTA";
- size_t seq_len = strlen( seq );
- size_t offset = 0;
- block new_block = 0;
+ bitblock *new_block = bitblock_new();
- for ( offset = 0; offset < seq_len - BLOCK_SIZE_NT + 1; offset++ )
- {
- assert( ( block_fill( seq, offset, &new_block ) == TRUE ) );
+ assert( new_block->bin == 0 );
+ assert( new_block->hasN == FALSE );
-// printf( "BLOCK: %s\n", block2dna[ new_block ] );
- }
+ printf( "done.\n");
+}
+
+
+void test_bitblock_print()
+{
+ printf( " Running test_bitblock_print ... " );
+
+ bitblock *new_block = bitblock_new();
+
+ new_block->bin = 7;
+ new_block->hasN = TRUE;
+
+// bitblock_print( new_block );
printf( "done.\n");
}
+void test_bitblock_list_print()
+{
+ printf( " Running test_bitblock_list_print ... " );
+
+ list_sl *list = list_sl_new();
+ node_sl *node1 = node_sl_new();
+ node_sl *node2 = node_sl_new();
+ node_sl *node3 = node_sl_new();
+ bitblock *block1 = bitblock_new();
+ bitblock *block2 = bitblock_new();
+ bitblock *block3 = bitblock_new();
+
+ block1->bin = 0;
+ block1->hasN = TRUE;
+ block2->bin = 1;
+ block2->hasN = TRUE;
+ block3->bin = 2;
+ block3->hasN = TRUE;
+
+ node1->val = block1;
+ node2->val = block2;
+ node3->val = block3;
+
+ list_sl_add_beg( &list, &node1 );
+ list_sl_add_beg( &list, &node2 );
+ list_sl_add_beg( &list, &node3 );
+
+ // bitblock_list_print( list );
+
+ printf( "done.\n" );
+}
+
+
void test_scan_seq()
{
printf( " Running test_scan_seq ... " );
- char *seq = "AAAATCGGCTA";
- size_t seq_len = strlen( seq );
+ //char *seq = "AAAANTCGGCTNGGGG";
+ char *seq = "AAAATCGGCTGGGG";
+ size_t seq_len = strlen( seq );
+ uint *count_array = mem_get_zero( sizeof( uint ) * ( 1 << 5 ) );
+
+ scan_seq( seq, seq_len, count_array );
+
+ printf( "done.\n");
+}
+
+
+static void test_blocks2motif()
+{
+ printf( " Running test_blocks2motif ... " );
+
+ uchar bin1 = 4;
+ uchar bin2 = 3;
+ short dist = 256;
+ uint motif = 0;
+
+ motif = blocks2motif( bin1, bin2, dist );
- scan_seq( seq, seq_len );
+// printf( "motif: %d\n", motif );
printf( "done.\n");
}