#include "fasta.h"
#include "list.h"
-#define BLOCK_SIZE_NT 4 /* one block holds 4 nucleotides. */
-#define BITS_IN_NT 2 /* two bits holds 1 nucleotide. */
-#define BITS_IN_BYTE 8 /* number of bits in one byte. */
-#define BLOCK_SPACE_MAX 64 /* maximum space between two blocks. */
-#define COUNT_ARRAY_SIZE ( sizeof( uint ) * ( 1 << 30 ) ) /* size of the unsigned int count array. */
-
+#define BLOCK_SIZE_NT 4 /* one block holds 4 nucleotides. */
+#define BITS_IN_NT 2 /* two bits holds 1 nucleotide. */
+#define BITS_IN_BYTE 8 /* number of bits in one byte. */
+#define BLOCK_SPACE_MAX 2 /* maximum space between two blocks. */
+#define BLOCK_MASK ( BLOCK_SPACE_MAX - 1 ) /* mask for printing block space. */
+#define COUNT_ARRAY_NMEMB ( 1 << 30 ) /* number of objects in the unsigned int count array. */
+#define CUTOFF 1 /* minimum number of motifs in output. */
+
#define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */
#define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */
#define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */
void run_scan( int argc, char *argv[] );
void print_usage();
void scan_file( char *file, seq_entry *entry, uint *count_array );
-uint *count_array_new( size_t size );
+uint *count_array_new( size_t nmemb );
void scan_seq( char *seq, size_t seq_len, uint *count_array );
void scan_list( list_sl *list, uint *count_array );
bitblock *bitblock_new();
uint blocks2motif( uchar bin1, uchar bin2, ushort dist );
+size_t filter_results( uint **array_ppt, size_t nmemb, uint cutoff );
int cmp_uint_desc( const void *a, const void *b );
-void count_array_print( uint *count_array );
+void count_array_print( uint *count_array, size_t nmemb );
void motif_print( uint motif, uint count );
void bitblock_list_print( list_sl *list );
void bitblock_print( bitblock *out );
static void test_bitblock_print();
static void test_bitblock_list_print();
static void test_scan_seq();
+static void test_filter_results();
static void test_array_sort();
static void test_blocks2motif();
int i = 0;
seq_entry *entry = NULL;
uint *count_array = NULL;
+ size_t new_nmemb = 0;
- count_array = count_array_new( COUNT_ARRAY_SIZE );
+ count_array = count_array_new( COUNT_ARRAY_NMEMB );
entry = seq_new( MAX_SEQ_NAME, MAX_SEQ );
file = argv[ i ];
fprintf( stderr, "Scanning file: %s\n", file );
-
scan_file( file, entry, count_array );
-
fprintf( stderr, "done.\n" );
}
- fprintf( stderr, "Sorting motifs: ... " );
-
- qsort( count_array, COUNT_ARRAY_SIZE, sizeof( uint ), cmp_uint_desc );
+ fprintf( stderr, "Filtering results (cutoff=%d) ... ", CUTOFF );
+ new_nmemb = filter_results( &count_array, COUNT_ARRAY_NMEMB, CUTOFF );
+ fprintf( stderr, "done.\n" );
+ fprintf( stderr, "Sorting motifs: ... " );
+ qsort( count_array, new_nmemb, sizeof( uint ), cmp_uint_desc );
fprintf( stderr, "done.\n" );
fprintf( stderr, "Printing motifs: ... " );
-
- count_array_print( count_array );
-
+ count_array_print( count_array, new_nmemb );
fprintf( stderr, "done.\n" );
seq_destroy( entry );
}
-uint *count_array_new( size_t size )
+uint *count_array_new( size_t nmemb )
{
/* Martin A. Hansen, September 2008 */
- /* Initialize a new zeroed uint array of a given byte size. */
+ /* Initialize a new zeroed uint array of nmemb objects. */
uint *array = NULL;
- array = mem_get_zero( size );
+ assert( nmemb > 0 );
+
+ array = mem_get_zero( nmemb * sizeof( uint ) );
return array;
}
if ( b_count > BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
{
- // bitblock_list_print( list );
+ // bitblock_list_print( list ); /* DEBUG */
scan_list( list, count_array );
}
}
+ /* if the list is shorter than BLOCK_SPACE_MAX + BLOCK_SIZE_NT */
+ if ( b_count <= BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
+ {
+ // bitblock_list_print( list ); /* DEBUG */
+
+ scan_list( list, count_array );
+ }
+
list_sl_destroy( &list );
}
{
block2 = ( bitblock * ) next_node->val;
- // printf( "block1: %s block2: %s dist: %d\n", bin2dna[ block1->bin ], bin2dna[ block2->bin ], dist ); /* DEBUG */
+// printf( "block1: %s block2: %s dist: %d\n", bin2dna[ block1->bin ], bin2dna[ block2->bin ], dist ); /* DEBUG */
if ( ! block2->hasN )
{
motif_bin = blocks2motif( block1->bin, block2->bin, dist );
-// motif_print( motif_bin, 0 ); /* DEBUG */
+ // motif_print( motif_bin, 0 ); /* DEBUG */
+ // bitblock_list_print( list ); /* DEBUG */
count_array[ motif_bin ]++;
}
}
+size_t filter_results( uint **array_ppt, size_t nmemb, uint cutoff )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Iterates an array of bipartite scan results */
+ /* determining how many elements are above the cutoff, */
+ /* then allocate memory for a new array, copy the */
+ /* values that were above the cutoff to the new array. */
+ /* The number of elements on the new array is returned, */
+ /* and the array is replaced. */
+
+ uint *array = ( uint * ) *array_ppt;
+ uint *new_array = NULL;
+ size_t i = 0;
+ size_t j = 0;
+ size_t new_nmemb = 0;
+
+ for ( i = 0; i < nmemb; i++ )
+ {
+ if ( array[ i ] >= cutoff ) {
+ new_nmemb++;
+ }
+ }
+
+ assert( new_nmemb > 0 );
+
+ new_array = count_array_new( new_nmemb );
+
+ for ( i = 0; i < nmemb; i++ )
+ {
+ if ( array[ i ] >= cutoff )
+ {
+ new_array[ j ] = array[ i ];
+
+ j++;
+ }
+ }
+
+ *array_ppt = new_array;
+
+ return new_nmemb;
+}
+
+
int cmp_uint_desc( const void *a, const void *b )
{
/* Martin A. Hansen, September 2008 */
}
-void count_array_print( uint *count_array )
+void count_array_print( uint *count_array, size_t nmemb )
{
/* Martin A. Hansen, Seqptember 2008. */
uint motif = 0;
uint count = 0;
- for ( i = 0; i < COUNT_ARRAY_SIZE / 4; i++ )
+ for ( i = 0; i < nmemb; i++ )
{
motif = i;
count = count_array[ i ];
- if ( count > 0 ) {
- motif_print( motif, count );
- }
+ motif_print( motif, count );
}
}
uchar bin2 = 0;
ushort dist = 0;
- dist = ( ushort ) motif & 255;
+ dist = ( ushort ) motif & BLOCK_MASK;
motif >>= sizeof( uchar ) * BITS_IN_BYTE;
test_bitblock_print();
test_bitblock_list_print();
test_scan_seq();
+ test_filter_results();
test_array_sort();
test_blocks2motif();
fprintf( stderr, " Running test_count_array_new ... " );
size_t i = 0;
- size_t size = 128;
+ size_t nmemb = 128;
uint *array = NULL;
- array = count_array_new( 4 * size );
+ array = count_array_new( nmemb );
- for ( i = 0; i < size; i++ ) {
+ for ( i = 0; i < nmemb; i++ ) {
assert( array[ i ] == 0 );
}
//char *seq = "AAAANTCGGCTNGGGG";
//char *seq = "AAAATCGGCTGGGG";
- char *seq = "AAAAAAAAAAAAAAA";
+ char *seq = "AAAAAAAAAAAAAAG";
size_t seq_len = strlen( seq );
- uint *count_array = mem_get_zero( sizeof( uint ) * ( 1 << 5 ) );
+ size_t nmemb = 1 << 5;
+
+ uint *count_array = count_array_new( nmemb );
scan_seq( seq, seq_len, count_array );
+ count_array_print( count_array, nmemb ); /* DEBUG */
+
+ fprintf( stderr, "done.\n" );
+}
+
+
+void test_filter_results()
+{
+ fprintf( stderr, " Running test_filter_results ... " );
+
+ uint array[] = { 4, 234, 23, 43, 12, 23, 1, 34 };
+ uint *array_pt = array;
+ size_t array_nmemb = sizeof( array ) / sizeof( uint );
+ uint cutoff = 20;
+ size_t size = 0;
+ uint i = 0;
+
+ size = filter_results( &array_pt, array_nmemb, cutoff );
+
+ assert( size == 5 ); /* 5 elements greather than 20. */
+
+ for ( i = 0; i < size; i ++ ) {
+// printf( "elem: %i\n", array_pt[ i ] );
+ }
+
fprintf( stderr, "done.\n" );
}