#include "fasta.h"
#include "list.h"
-#define BLOCK_SIZE_NT 4 /* one block holds 4 nucleotides. */
-#define BITS_IN_NT 2 /* two bits holds 1 nucleotide. */
-#define BITS_IN_BYTE 8 /* number of bits in one byte. */
-#define BLOCK_SPACE_MAX 256 /* maximum space between two blocks. */
-// #define COUNT_ARRAY_SIZE ( 1 << 30 ) /* size of the unsigned int count array. */
-#define COUNT_ARRAY_SIZE ( 1 << 5 ) /* size of the unsigned int count array. */
-
-#define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */
-#define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */
-#define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */
-#define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */
-
+#define BLOCK_SIZE_NT 4 /* one block holds 4 nucleotides. */
+#define BITS_IN_NT 2 /* two bits holds 1 nucleotide. */
+#define BITS_IN_BYTE 8 /* number of bits in one byte. */
+#define BLOCK_SPACE_MAX 64 /* maximum space between two blocks. */
+#define COUNT_ARRAY_SIZE ( sizeof( uint ) * ( 1 << 30 ) ) /* size of the unsigned int count array. */
+
+#define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */
+#define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */
+#define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */
+#define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */
+
+/* Structure that will hold one tetra nucleotide block. */
struct _bitblock
{
- uchar bin;
- bool hasN;
+ uchar bin; /* Tetra nucleotide encoded binary. */
+ bool hasN; /* Flag indicating any N's in the block. */
};
typedef struct _bitblock bitblock;
+/* Byte array for fast convertion of binary blocks back to DNA. */
char *bin2dna[256] = {
"AAAA", "AAAC", "AAAG", "AAAT", "AACA", "AACC", "AACG", "AACT",
"AAGA", "AAGC", "AAGG", "AAGT", "AATA", "AATC", "AATG", "AATT",
"TTGA", "TTGC", "TTGG", "TTGT", "TTTA", "TTTC", "TTTG", "TTTT"
};
+/* Function declarations. */
void run_scan( int argc, char *argv[] );
void print_usage();
void scan_file( char *file, seq_entry *entry, uint *count_array );
+uint *count_array_new( size_t size );
void scan_seq( char *seq, size_t seq_len, uint *count_array );
void scan_list( list_sl *list, uint *count_array );
bitblock *bitblock_new();
-void bitblock_print( bitblock *out );
-void bitblock_list_print( list_sl *list );
-uint blocks2motif( uchar bin1, uchar bin2, short dist );
+uint blocks2motif( uchar bin1, uchar bin2, ushort dist );
void count_array_print( uint *count_array );
void motif_print( uint motif, uint count );
+void bitblock_list_print( list_sl *list );
+void bitblock_print( bitblock *out );
+/* Unit test declarations. */
static void run_tests();
+static void test_count_array_new();
static void test_bitblock_new();
static void test_bitblock_print();
static void test_bitblock_list_print();
static void test_blocks2motif();
+/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> MAIN <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
+
+
int main( int argc, char *argv[] )
{
if ( argc == 1 ) {
}
+/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> FUNCTIONS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
+
+
void print_usage()
{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Print usage and exit if no files in argument. */
+
fprintf( stderr,
"Usage: bipartite_scam <FASTA file(s)> > result.csv\n"
);
{
/* Martin A. Hansen, September 2008 */
- /* Scan a stack of files. */
+ /* For each file in argv scan the file for */
+ /* bipartite motifs and output the motifs */
+ /* and their count. */
char *file = NULL;
int i = 0;
seq_entry *entry = NULL;
uint *count_array = NULL;
- count_array = mem_get_zero( COUNT_ARRAY_SIZE );
+ count_array = count_array_new( COUNT_ARRAY_SIZE );
entry = seq_new( MAX_SEQ_NAME, MAX_SEQ );
}
+uint *count_array_new( size_t size )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Initialize a new zeroed uint array of a given byte size. */
+
+ uint *array = NULL;
+
+ array = mem_get_zero( size );
+
+ return array;
+}
+
+
void scan_file( char *file, seq_entry *entry, uint *count_array )
{
/* Martin A. Hansen, September 2008 */
- /* Scan all entries of a file in both */
- /* sense and anti-sense directions .*/
+ /* Scan all FASTA entries of a file in both */
+ /* sense and anti-sense directions. */
FILE *fp = read_open( file );
while ( fasta_get_entry( fp, &entry ) == TRUE )
{
- fprintf( stderr, " + Scanning: %s ... ", entry->seq_name );
-
- scan_seq( entry->seq, entry->seq_len, count_array );
-
- fprintf( stderr, "done.\n" );
-
- revcomp_dna( entry->seq );
-
- fprintf( stderr, " - Scanning: %s ... ", entry->seq_name );
+ fprintf( stderr, " Scanning: %s (%zu nt) ... ", entry->seq_name, entry->seq_len );
scan_seq( entry->seq, entry->seq_len, count_array );
}
-bitblock *bitblock_new()
-{
- /* Martin A. Hansen, September 2008 */
-
- /* Initializes a new block. */
-
- bitblock *new_block = NULL;
-
- new_block = mem_get( sizeof( bitblock ) );
-
- new_block->bin = 0;
- new_block->hasN = FALSE;
-
- return new_block;
-}
-
-
-void bitblock_print( bitblock *out )
-{
- /* Martin A. Hansen, September 2008 */
-
- /* Debug function to print a given block. */
-
- printf( "bin: %d dna: %s hasN: %d\n", out->bin, bin2dna[ ( int ) out->bin ], out->hasN );
-}
-
-
-void bitblock_list_print( list_sl *list )
-{
- /* Martin A. Hansen, September 2008 */
-
- /* Debug function to print all blocks in a list. */
-
- node_sl *node = NULL;
-
- for ( node = list->first; node != NULL; node = node->next ) {
- bitblock_print( ( bitblock * ) node->val );
- }
-}
-
-
void scan_seq( char *seq, size_t seq_len, uint *count_array )
{
/* Martin A. Hansen, September 2008 */
+
+ /* Run a sliding window over a given sequence. The window */
+ /* consists of a list where new blocks of 4 nucleotides */
+ /* are pushed onto one end while at the same time old */
+ /* blocks are popped from the other end. The number of */
+ /* in the list is determined by the maximum seperator. */
+ /* Everytime we have a full window, the window is scanned */
+ /* for motifs. */
bitblock *block = NULL;
- short b_count = 0;
- short n_count = 0;
+ size_t b_count = 0;
+ ushort n_count = 0;
size_t i = 0;
uchar bin = 0;
bool first_node = TRUE;
if ( b_count > BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
{
// bitblock_list_print( list );
-
+
scan_list( list, count_array );
list_sl_remove_beg( &list );
{
/* Martin A. Hansen, September 2008 */
+ /* Scan a list of blocks for biparite motifs by creating */
+ /* a binary motif consisting of two blocks of 4 nucleotides */
+ /* along with the distance separating them. Motifs containing */
+ /* N's are skipped. */
+
node_sl *first_node = NULL;
node_sl *next_node = NULL;
bitblock *block1 = NULL;
bitblock *block2 = NULL;
int i = 0;
- short dist = 0;
+ ushort dist = 0;
uint motif_bin = 0;
// bitblock_list_print( list );
{
block2 = ( bitblock * ) next_node->val;
-// printf( "block1: %s block2: %s dist: %d\n", bin2dna[ block1->bin ], bin2dna[ block2->bin ], dist );
+ // printf( "block1: %s block2: %s dist: %d\n", bin2dna[ block1->bin ], bin2dna[ block2->bin ], dist ); /* DEBUG */
if ( ! block2->hasN )
{
motif_bin = blocks2motif( block1->bin, block2->bin, dist );
- // motif_print( motif_bin );
+// motif_print( motif_bin, 0 ); /* DEBUG */
count_array[ motif_bin ]++;
}
}
-uint blocks2motif( uchar bin1, uchar bin2, short dist )
+bitblock *bitblock_new()
{
/* Martin A. Hansen, September 2008 */
+ /* Initializes a new empty bitblock. */
+
+ bitblock *new_block = NULL;
+
+ new_block = mem_get( sizeof( bitblock ) );
+
+ new_block->bin = 0;
+ new_block->hasN = FALSE;
+
+ return new_block;
+}
+
+
+uint blocks2motif( uchar bin1, uchar bin2, ushort dist )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Given two binary encoded tetra nuceotide blocks, */
+ /* and the distance separating this, create a binary */
+ /* bipartite motif. */
+
uint motif = 0;
motif |= bin1;
motif |= bin2;
- motif <<= sizeof( short ) * BITS_IN_BYTE;
+ motif <<= sizeof( uchar ) * BITS_IN_BYTE;
motif |= dist;
{
/* Martin A. Hansen, Seqptember 2008. */
- /* Print all motifs in count_array as */
+ /* Print all bipartite motifs in count_array as */
/* tabular output. */
uint i = 0;
uint motif = 0;
uint count = 0;
- for ( i = 0; i < COUNT_ARRAY_SIZE; i++ )
+ for ( i = 0; i < COUNT_ARRAY_SIZE / 4; i++ )
{
motif = i;
count = count_array[ i ];
- motif_print( motif, count );
+ if ( count > 0 ) {
+ motif_print( motif, count );
+ }
}
}
{
/* Martin A. Hansen, September 2008 */
- uchar bin1 = 0;
- uchar bin2 = 0;
- short dist = 0;
+ /* Converts a binary encoded bipartite motif */
+ /* into DNA and output the motif, distance and */
+ /* count seperated by tabs: */
+ /* BLOCK1 \t BLOCK2 \t DIST \t COUNT */
+
+ uchar bin1 = 0;
+ uchar bin2 = 0;
+ ushort dist = 0;
- dist = ( short ) motif;
+ dist = ( ushort ) motif & 255;
- motif >>= sizeof( short ) * BITS_IN_BYTE;
+ motif >>= sizeof( uchar ) * BITS_IN_BYTE;
bin2 = ( uchar ) motif;
}
+void bitblock_list_print( list_sl *list )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Debug function to print all blocks in a list. */
+
+ node_sl *node = NULL;
+
+ printf( "\nbitblock_list_print:\n" );
+
+ for ( node = list->first; node != NULL; node = node->next ) {
+ bitblock_print( ( bitblock * ) node->val );
+ }
+}
+
+
+void bitblock_print( bitblock *out )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Debug function to print a given block. */
+
+ printf( "bin: %d dna: %s hasN: %d\n", out->bin, bin2dna[ ( int ) out->bin ], out->hasN );
+}
+
+
/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> UNIT TESTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
void run_tests()
{
- printf( "Running tests\n" );
+ fprintf( stderr, "Running tests\n" );
+ test_count_array_new();
test_bitblock_new();
test_bitblock_print();
test_bitblock_list_print();
test_scan_seq();
test_blocks2motif();
- printf( "All tests OK\n" );
+ fprintf( stderr, "All tests OK\n" );
+}
+
+
+void test_count_array_new()
+{
+ fprintf( stderr, " Running test_count_array_new ... " );
+
+ size_t i = 0;
+ size_t size = 128;
+ uint *array = NULL;
+
+ array = count_array_new( 4 * size );
+
+ for ( i = 0; i < size; i++ ) {
+ assert( array[ i ] == 0 );
+ }
+
+ mem_free( &array );
+
+ fprintf( stderr, "done.\n" );
}
void test_bitblock_new()
{
- printf( " Running test_bitblock_new ... " );
+ fprintf( stderr, " Running test_bitblock_new ... " );
bitblock *new_block = bitblock_new();
assert( new_block->bin == 0 );
assert( new_block->hasN == FALSE );
- printf( "done.\n");
+ fprintf( stderr, "done.\n" );
}
void test_bitblock_print()
{
- printf( " Running test_bitblock_print ... " );
+ fprintf( stderr, " Running test_bitblock_print ... " );
bitblock *new_block = bitblock_new();
// bitblock_print( new_block );
- printf( "done.\n");
+ fprintf( stderr, "done.\n");
}
void test_bitblock_list_print()
{
- printf( " Running test_bitblock_list_print ... " );
+ fprintf( stderr, " Running test_bitblock_list_print ... " );
list_sl *list = list_sl_new();
node_sl *node1 = node_sl_new();
// bitblock_list_print( list );
- printf( "done.\n" );
+ fprintf( stderr, "done.\n" );
}
void test_scan_seq()
{
- printf( " Running test_scan_seq ... " );
+ fprintf( stderr, " Running test_scan_seq ... " );
//char *seq = "AAAANTCGGCTNGGGG";
- char *seq = "AAAATCGGCTGGGG";
+ //char *seq = "AAAATCGGCTGGGG";
+ char *seq = "AAAAAAAAAAAAAAA";
size_t seq_len = strlen( seq );
uint *count_array = mem_get_zero( sizeof( uint ) * ( 1 << 5 ) );
scan_seq( seq, seq_len, count_array );
- printf( "done.\n");
+ fprintf( stderr, "done.\n" );
}
static void test_blocks2motif()
{
- printf( " Running test_blocks2motif ... " );
+ fprintf( stderr, " Running test_blocks2motif ... " );
- uchar bin1 = 4;
- uchar bin2 = 3;
- short dist = 256;
- uint motif = 0;
+ uchar bin1 = 4;
+ uchar bin2 = 3;
+ ushort dist = 256;
+ uint motif = 0;
motif = blocks2motif( bin1, bin2, dist );
// printf( "motif: %d\n", motif );
- printf( "done.\n");
+ fprintf( stderr, "done.\n");
}