From: martinahansen Date: Mon, 8 Sep 2008 05:29:45 +0000 (+0000) Subject: fixed 0 offset bug in remove_adaptor X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=0620b1aebbe9c3c546185c52744f51459121966c;p=biopieces.git fixed 0 offset bug in remove_adaptor git-svn-id: http://biopieces.googlecode.com/svn/trunk@250 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_c/Maasha/src/bipartite_scan.c b/code_c/Maasha/src/bipartite_scan.c index 59076fb..d677f66 100644 --- a/code_c/Maasha/src/bipartite_scan.c +++ b/code_c/Maasha/src/bipartite_scan.c @@ -5,12 +5,14 @@ #include "fasta.h" #include "list.h" -#define BLOCK_SIZE_NT 4 /* one block holds 4 nucleotides. */ -#define BITS_IN_NT 2 /* two bits holds 1 nucleotide. */ -#define BITS_IN_BYTE 8 /* number of bits in one byte. */ -#define BLOCK_SPACE_MAX 64 /* maximum space between two blocks. */ -#define COUNT_ARRAY_SIZE ( sizeof( uint ) * ( 1 << 30 ) ) /* size of the unsigned int count array. */ - +#define BLOCK_SIZE_NT 4 /* one block holds 4 nucleotides. */ +#define BITS_IN_NT 2 /* two bits holds 1 nucleotide. */ +#define BITS_IN_BYTE 8 /* number of bits in one byte. */ +#define BLOCK_SPACE_MAX 2 /* maximum space between two blocks. */ +#define BLOCK_MASK ( BLOCK_SPACE_MAX - 1 ) /* mask for printing block space. */ +#define COUNT_ARRAY_NMEMB ( 1 << 30 ) /* number of objects in the unsigned int count array. */ +#define CUTOFF 1 /* minimum number of motifs in output. */ + #define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */ #define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */ #define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */ @@ -65,13 +67,14 @@ char *bin2dna[256] = { void run_scan( int argc, char *argv[] ); void print_usage(); void scan_file( char *file, seq_entry *entry, uint *count_array ); -uint *count_array_new( size_t size ); +uint *count_array_new( size_t nmemb ); void scan_seq( char *seq, size_t seq_len, uint *count_array ); void scan_list( list_sl *list, uint *count_array ); bitblock *bitblock_new(); uint blocks2motif( uchar bin1, uchar bin2, ushort dist ); +size_t filter_results( uint **array_ppt, size_t nmemb, uint cutoff ); int cmp_uint_desc( const void *a, const void *b ); -void count_array_print( uint *count_array ); +void count_array_print( uint *count_array, size_t nmemb ); void motif_print( uint motif, uint count ); void bitblock_list_print( list_sl *list ); void bitblock_print( bitblock *out ); @@ -83,6 +86,7 @@ static void test_bitblock_new(); static void test_bitblock_print(); static void test_bitblock_list_print(); static void test_scan_seq(); +static void test_filter_results(); static void test_array_sort(); static void test_blocks2motif(); @@ -133,8 +137,9 @@ void run_scan( int argc, char *argv[] ) int i = 0; seq_entry *entry = NULL; uint *count_array = NULL; + size_t new_nmemb = 0; - count_array = count_array_new( COUNT_ARRAY_SIZE ); + count_array = count_array_new( COUNT_ARRAY_NMEMB ); entry = seq_new( MAX_SEQ_NAME, MAX_SEQ ); @@ -143,22 +148,20 @@ void run_scan( int argc, char *argv[] ) file = argv[ i ]; fprintf( stderr, "Scanning file: %s\n", file ); - scan_file( file, entry, count_array ); - fprintf( stderr, "done.\n" ); } - fprintf( stderr, "Sorting motifs: ... " ); - - qsort( count_array, COUNT_ARRAY_SIZE, sizeof( uint ), cmp_uint_desc ); + fprintf( stderr, "Filtering results (cutoff=%d) ... ", CUTOFF ); + new_nmemb = filter_results( &count_array, COUNT_ARRAY_NMEMB, CUTOFF ); + fprintf( stderr, "done.\n" ); + fprintf( stderr, "Sorting motifs: ... " ); + qsort( count_array, new_nmemb, sizeof( uint ), cmp_uint_desc ); fprintf( stderr, "done.\n" ); fprintf( stderr, "Printing motifs: ... " ); - - count_array_print( count_array ); - + count_array_print( count_array, new_nmemb ); fprintf( stderr, "done.\n" ); seq_destroy( entry ); @@ -167,15 +170,17 @@ void run_scan( int argc, char *argv[] ) } -uint *count_array_new( size_t size ) +uint *count_array_new( size_t nmemb ) { /* Martin A. Hansen, September 2008 */ - /* Initialize a new zeroed uint array of a given byte size. */ + /* Initialize a new zeroed uint array of nmemb objects. */ uint *array = NULL; - array = mem_get_zero( size ); + assert( nmemb > 0 ); + + array = mem_get_zero( nmemb * sizeof( uint ) ); return array; } @@ -266,7 +271,7 @@ void scan_seq( char *seq, size_t seq_len, uint *count_array ) if ( b_count > BLOCK_SPACE_MAX + BLOCK_SIZE_NT ) { - // bitblock_list_print( list ); + // bitblock_list_print( list ); /* DEBUG */ scan_list( list, count_array ); @@ -275,6 +280,14 @@ void scan_seq( char *seq, size_t seq_len, uint *count_array ) } } + /* if the list is shorter than BLOCK_SPACE_MAX + BLOCK_SIZE_NT */ + if ( b_count <= BLOCK_SPACE_MAX + BLOCK_SIZE_NT ) + { + // bitblock_list_print( list ); /* DEBUG */ + + scan_list( list, count_array ); + } + list_sl_destroy( &list ); } @@ -314,13 +327,14 @@ void scan_list( list_sl *list, uint *count_array ) { block2 = ( bitblock * ) next_node->val; - // printf( "block1: %s block2: %s dist: %d\n", bin2dna[ block1->bin ], bin2dna[ block2->bin ], dist ); /* DEBUG */ +// printf( "block1: %s block2: %s dist: %d\n", bin2dna[ block1->bin ], bin2dna[ block2->bin ], dist ); /* DEBUG */ if ( ! block2->hasN ) { motif_bin = blocks2motif( block1->bin, block2->bin, dist ); -// motif_print( motif_bin, 0 ); /* DEBUG */ + // motif_print( motif_bin, 0 ); /* DEBUG */ + // bitblock_list_print( list ); /* DEBUG */ count_array[ motif_bin ]++; } @@ -372,6 +386,50 @@ uint blocks2motif( uchar bin1, uchar bin2, ushort dist ) } +size_t filter_results( uint **array_ppt, size_t nmemb, uint cutoff ) +{ + /* Martin A. Hansen, September 2008 */ + + /* Iterates an array of bipartite scan results */ + /* determining how many elements are above the cutoff, */ + /* then allocate memory for a new array, copy the */ + /* values that were above the cutoff to the new array. */ + /* The number of elements on the new array is returned, */ + /* and the array is replaced. */ + + uint *array = ( uint * ) *array_ppt; + uint *new_array = NULL; + size_t i = 0; + size_t j = 0; + size_t new_nmemb = 0; + + for ( i = 0; i < nmemb; i++ ) + { + if ( array[ i ] >= cutoff ) { + new_nmemb++; + } + } + + assert( new_nmemb > 0 ); + + new_array = count_array_new( new_nmemb ); + + for ( i = 0; i < nmemb; i++ ) + { + if ( array[ i ] >= cutoff ) + { + new_array[ j ] = array[ i ]; + + j++; + } + } + + *array_ppt = new_array; + + return new_nmemb; +} + + int cmp_uint_desc( const void *a, const void *b ) { /* Martin A. Hansen, September 2008 */ @@ -386,7 +444,7 @@ int cmp_uint_desc( const void *a, const void *b ) } -void count_array_print( uint *count_array ) +void count_array_print( uint *count_array, size_t nmemb ) { /* Martin A. Hansen, Seqptember 2008. */ @@ -397,14 +455,12 @@ void count_array_print( uint *count_array ) uint motif = 0; uint count = 0; - for ( i = 0; i < COUNT_ARRAY_SIZE / 4; i++ ) + for ( i = 0; i < nmemb; i++ ) { motif = i; count = count_array[ i ]; - if ( count > 0 ) { - motif_print( motif, count ); - } + motif_print( motif, count ); } } @@ -422,7 +478,7 @@ void motif_print( uint motif, uint count ) uchar bin2 = 0; ushort dist = 0; - dist = ( ushort ) motif & 255; + dist = ( ushort ) motif & BLOCK_MASK; motif >>= sizeof( uchar ) * BITS_IN_BYTE; @@ -474,6 +530,7 @@ void run_tests() test_bitblock_print(); test_bitblock_list_print(); test_scan_seq(); + test_filter_results(); test_array_sort(); test_blocks2motif(); @@ -486,12 +543,12 @@ void test_count_array_new() fprintf( stderr, " Running test_count_array_new ... " ); size_t i = 0; - size_t size = 128; + size_t nmemb = 128; uint *array = NULL; - array = count_array_new( 4 * size ); + array = count_array_new( nmemb ); - for ( i = 0; i < size; i++ ) { + for ( i = 0; i < nmemb; i++ ) { assert( array[ i ] == 0 ); } @@ -568,12 +625,39 @@ void test_scan_seq() //char *seq = "AAAANTCGGCTNGGGG"; //char *seq = "AAAATCGGCTGGGG"; - char *seq = "AAAAAAAAAAAAAAA"; + char *seq = "AAAAAAAAAAAAAAG"; size_t seq_len = strlen( seq ); - uint *count_array = mem_get_zero( sizeof( uint ) * ( 1 << 5 ) ); + size_t nmemb = 1 << 5; + + uint *count_array = count_array_new( nmemb ); scan_seq( seq, seq_len, count_array ); + count_array_print( count_array, nmemb ); /* DEBUG */ + + fprintf( stderr, "done.\n" ); +} + + +void test_filter_results() +{ + fprintf( stderr, " Running test_filter_results ... " ); + + uint array[] = { 4, 234, 23, 43, 12, 23, 1, 34 }; + uint *array_pt = array; + size_t array_nmemb = sizeof( array ) / sizeof( uint ); + uint cutoff = 20; + size_t size = 0; + uint i = 0; + + size = filter_results( &array_pt, array_nmemb, cutoff ); + + assert( size == 5 ); /* 5 elements greather than 20. */ + + for ( i = 0; i < size; i ++ ) { +// printf( "elem: %i\n", array_pt[ i ] ); + } + fprintf( stderr, "done.\n" ); } diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index 044d851..041cbb2 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -4697,7 +4697,14 @@ sub script_remove_adaptor my ( $record, $adaptor, $seq, $adaptor_len, $seq_len, $offset, $max_match, $max_mismatch, $pos ); $max_mismatch = $options->{ "mismatches" } || 0; - $offset = $options->{ "offset" } || 15; + $offset = $options->{ "offset" }; + + if ( not defined $offset ) { + $offset = 0; + } else { + $offset--; + } + $adaptor = $options->{ "adaptor" }; $adaptor_len = length $adaptor; $adaptor = [ split //, uc $adaptor ];