From 60151059a5f4b21ba091d3c2122dc300d2eadafa Mon Sep 17 00:00:00 2001 From: martinahansen Date: Fri, 22 Aug 2008 01:33:23 +0000 Subject: [PATCH] added option to read_soft for selecting specific samples git-svn-id: http://biopieces.googlecode.com/svn/trunk@218 74ccb610-7750-0410-82ae-013aeee3265d --- code_c/Maasha/src/Makefile | 6 +- code_c/Maasha/src/gmon.out | Bin 733825 -> 733145 bytes code_c/Maasha/src/repeat-O-matic.c | 299 ++++++++++++++--------------- code_c/Maasha/src/test/test_seq.c | 60 ++++++ code_c/Maasha/src/tetra_count.c | 120 ++++++++++++ code_perl/Maasha/Biopieces.pm | 6 + 6 files changed, 333 insertions(+), 158 deletions(-) create mode 100644 code_c/Maasha/src/test/test_seq.c create mode 100644 code_c/Maasha/src/tetra_count.c diff --git a/code_c/Maasha/src/Makefile b/code_c/Maasha/src/Makefile index 25191a5..0ab1de8 100644 --- a/code_c/Maasha/src/Makefile +++ b/code_c/Maasha/src/Makefile @@ -9,7 +9,7 @@ TEST_DIR = test/ INC = -I $(INC_DIR) LIB = -lm $(LIB_DIR)*.o -all: libs utest fasta_count repeat-O-matic +all: libs utest fasta_count repeat-O-matic tetra_count libs: cd $(LIB_DIR) && ${MAKE} all @@ -23,8 +23,12 @@ fasta_count: fasta_count.c repeat-O-matic: repeat-O-matic.c $(CC) $(Cflags) $(INC) $(LIB) repeat-O-matic.c -o repeat-O-matic +tetra_count: tetra_count.c + $(CC) $(Cflags) $(INC) $(LIB) tetra_count.c -o tetra_count + clean: cd $(LIB_DIR) && ${MAKE} clean cd $(TEST_DIR) && ${MAKE} clean rm fasta_count rm repeat-O-matic + rm tetra_count diff --git a/code_c/Maasha/src/gmon.out b/code_c/Maasha/src/gmon.out index 7cfc958a65aaa3536e0993576eb3bccf1ceed374..69f3a07b9196260c4aedc4f41ea93f29b8398859 100644 GIT binary patch delta 135 zcmZpCs&n(c&V*P#6FCNk0wA6d#lT=9w=r8w&Wf3Vfk6R?8{`=n+9DYk7=dgMUI1is v0F{B*Oi(dQ?ad4A+ZWn10x=U1GXpUT5VHa?8xXStF$WNHZeM86_23f#Ps$)* delta 385 zcmZvYy=nqc5Jo?L*1s5C`$R=jg)Jno-4$Uz1Y23a8p2X diff --git a/code_c/Maasha/src/repeat-O-matic.c b/code_c/Maasha/src/repeat-O-matic.c index 392964c..dcfa1dd 100644 --- a/code_c/Maasha/src/repeat-O-matic.c +++ b/code_c/Maasha/src/repeat-O-matic.c @@ -17,100 +17,107 @@ #include "seq.h" #include "fasta.h" + // #define OLIGO_SIZE 15 #define OLIGO_SIZE 5 #define SIZE ( 1 << ( OLIGO_SIZE * 2 ) ) -#define UINT_BITS 32 -#define T 3 /* 11 on the rightmost two bits of bin. */ -#define C 1 /* 01 on the rightmost two bits of bin. */ -#define G 2 /* 10 on the rightmost two bits of bin. */ +#define T 3 /* 11 on the rightmost two bits of bin. */ +#define C 1 /* 01 on the rightmost two bits of bin. */ +#define G 2 /* 10 on the rightmost two bits of bin. */ + -uint *oligo_count( char *path ); uint mask_create( int oligo_size ); -void oligo_count_output( char *path, uint *array ); +void oligo_count( char *path, uint **array_ppt, uint mask ); +void oligo_count_output( char *path, uint *array, uint mask ); void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size ); + int main( int argc, char *argv[] ) { char *path = NULL; + uint mask = 0; uint *array = NULL; - path = argv[ 1 ]; + path = argv[ 1 ]; + + mask = mask_create( OLIGO_SIZE ); - array = oligo_count( path ); + oligo_count( path, &array, mask ); - //oligo_count_output( path, array ); + oligo_count_output( path, array, mask ); return 0; } -uint *oligo_count( char *path ) +uint mask_create( int oligo_size ) { /* Martin A. Hansen, June 2008 */ - /* Count the occurence of all oligos of a fixed size in a FASTA file. */ + /* Create a bit mask for binary encoded oligos less than 32 bits. */ - uint *array = NULL; - uint i = 0; - uint mask = 0; - uint bin = 0; - uint bin_rc1 = 0; - uint bin_rc2 = 0; - uint j = 0; - uint A_rc = ( 3 << ( UINT_BITS - 2 ) ); /* 11 on the leftmost two bits an uint. */ - uint G_rc = ( 2 << ( UINT_BITS - 2 ) ); /* 10 on the leftmost two bits an uint. */ - uint C_rc = ( 1 << ( UINT_BITS - 2 ) ); /* 01 on the leftmost two bits an uint. */ - seq_entry *entry = NULL; - FILE *fp = NULL; + uint i = 0; + uint mask = 0; - array = mem_get_zero( sizeof( uint ) * SIZE ); + for ( i = 0; i < oligo_size; i++ ) + { + mask <<= 2; + + mask |= 3; /* add 11 to mask */ + } + + return mask; +} - mask = mask_create( OLIGO_SIZE ); - entry = mem_get( sizeof( entry ) ); +void oligo_count( char *path, uint **array_ppt, uint mask ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Count the occurence of all oligos of a fixed size in a FASTA file. */ + + uint *array = *array_ppt; + uint i = 0; + uint j = 0; + uint bin = 0; + seq_entry *entry = NULL; + FILE *fp = NULL; + + array = mem_get_zero( sizeof( uint ) * SIZE ); fp = read_open( path ); - while ( ( fasta_get_entry( fp, &entry ) ) ) + seq_new( &entry, MAX_SEQ_NAME, MAX_SEQ ); + + while ( ( fasta_get_entry( fp, &entry ) ) != 0 ) { fprintf( stderr, "Counting oligos in: %s ... ", entry->seq_name ); - bin = 0; - bin_rc1 = 0; - j = 0; + bin = 0; + j = 0; for ( i = 0; entry->seq[ i ]; i++ ) { - bin <<= 2; - bin_rc1 >>= 2; + bin <<= 2; switch( entry->seq[ i ] ) { - case 'A': case 'a': bin_rc1 |= A_rc; j++; break; - case 'T': case 't': bin |= T; j++; break; - case 'C': case 'c': bin |= C; bin_rc1 |= G_rc; j++; break; - case 'G': case 'g': bin |= G; bin_rc1 |= C_rc; j++; break; - default: bin = 0; bin_rc1 = 0; j = 0; break; + case 'A': case 'a': j++; break; + case 'T': case 't': bin |= T; j++; break; + case 'C': case 'c': bin |= C; j++; break; + case 'G': case 'g': bin |= G; j++; break; + default: bin = 0; j = 0; break; } if ( j >= OLIGO_SIZE ) { array[ ( bin & mask ) ]++; - - bin_rc2 = bin_rc1; - - bin_rc2 >>= ( UINT_BITS - ( OLIGO_SIZE * 2 ) ); - - array[ ( bin_rc2 ) ]++; /* printf( "\n" ); printf( "mask : %s\n", bits2string( mask ) ); printf( "bin : %s\n", bits2string( bin ) ); printf( "bin & mask: %s\n", bits2string( bin & mask ) ); - printf( "bin_rc1 : %s\n", bits2string( bin_rc1 ) ); - printf( "bin_rc2 : %s\n", bits2string( bin_rc2 ) ); */ } } @@ -124,132 +131,108 @@ uint *oligo_count( char *path ) free( entry->seq ); entry = NULL; - return array; + *array_ppt = array; } -uint mask_create( int oligo_size ) +void oligo_count_output( char *path, uint *array, uint mask ) { /* Martin A. Hansen, June 2008 */ - /* Create a bit mask for binary encode oligos less than sizeof( uint ). */ + /* Output oligo count for each sequence position. */ - uint i; - uint mask; + uint i; + uint j; + uint bin; + int count; + uint *block; + uint block_pos; + uint block_beg; + uint block_size; + uint chr_pos; + seq_entry *entry; + FILE *fp; - mask = 0; + seq_new( &entry, MAX_SEQ_NAME, MAX_SEQ ); - for ( i = 0; i < oligo_size; i++ ) + fp = read_open( path ); + + while ( ( fasta_get_entry( fp, &entry ) ) != 0 ) { - mask <<= 2; + fprintf( stderr, "Writing results for: %s ... ", entry->seq_name ); - mask |= 3; - } + bin = 0; + j = 0; + block_pos = 0; + block_size = sizeof( uint ) * ( entry->seq_len + OLIGO_SIZE ); + block = mem_get_zero( block_size ); - return mask; -} + for ( i = 0; entry->seq[ i ]; i++ ) + { + bin <<= 2; + switch( entry->seq[ i ] ) + { + case 'A': case 'a': j++; break; + case 'T': case 't': bin |= T; j++; break; + case 'C': case 'c': bin |= C; j++; break; + case 'G': case 'g': bin |= G; j++; break; + default: bin = 0; j = 0; break; + } -//void oligo_count_output( char *path, uint *array ) -//{ -// /* Martin A. Hansen, June 2008 */ -// -// /* Output oligo count for each sequence position. */ -// -// struct seq_entry *entry; -// FILE *fp; -// uint mask; -// uint i; -// uint j; -// uint bin; -// int count; -// uint *block; -// uint block_pos; -// uint block_beg; -// uint block_size; -// uint chr_pos; -// file_buffer *buffer; -// -// mask = mask_create( OLIGO_SIZE ); -// -// entry = mem_get( sizeof( entry ) ); -// -// fp = read_open( path ); -// -// while ( ( fasta_get_entry( buffer, &entry ) ) ) -// { -// fprintf( stderr, "Writing results for: %s ... ", entry->seq_name ); -// -// bin = 0; -// j = 0; -// block_pos = 0; -// block_size = sizeof( uint ) * ( entry->seq_len + OLIGO_SIZE ); -// block = mem_get_zero( block_size ); -// -// for ( i = 0; entry->seq[ i ]; i++ ) -// { -// bin <<= 2; -// -// switch( entry->seq[ i ] ) -// { -// case 'A': case 'a': j++; break; -// case 'T': case 't': bin |= T; j++; break; -// case 'C': case 'c': bin |= C; j++; break; -// case 'G': case 'g': bin |= G; j++; break; -// default: bin = 0; j = 0; break; -// } -// -// if ( j >= OLIGO_SIZE ) -// { -// count = array[ ( bin & mask ) ]; -// -// if ( count > 1 ) -// { -// chr_pos = i - OLIGO_SIZE + 1; -// -// if ( block_pos == 0 ) -// { -// memset( block, '\0', block_size ); -// -// block_beg = chr_pos; -// -// block[ block_pos ] = count; -// -// block_pos++; -// } -// else -// { -// if ( chr_pos > block_beg + block_pos ) -// { -// fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos ); -// -// block_pos = 0; -// } -// else -// { -// block[ block_pos ] = count; -// -// block_pos++; -// } -// } -// } -// } -// } -// -// if ( block_pos > 0 ) -// { -// fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos ); -// -// mem_free( ( void * ) &block ); -// } -// -// fprintf( stderr, "done.\n" ); -// } -// -// close_stream( fp ); -// -// fasta_free_entry( entry ); -//} + if ( j >= OLIGO_SIZE ) + { + count = array[ ( bin & mask ) ]; + + if ( count > 1 ) + { + chr_pos = i - OLIGO_SIZE + 1; + + if ( block_pos == 0 ) + { + memset( block, '\0', block_size ); + + block_beg = chr_pos; + + block[ block_pos ] = count; + + block_pos++; + } + else + { + if ( chr_pos > block_beg + block_pos ) + { + fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos ); + + block_pos = 0; + } + else + { + block[ block_pos ] = count; + + block_pos++; + } + } + } + } + } + + if ( block_pos > 0 ) + { + fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos ); + + mem_free( ( void * ) &block ); + } + + fprintf( stderr, "done.\n" ); + } + + free( entry->seq_name ); + free( entry->seq ); + entry = NULL; + + close_stream( fp ); +} void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size ) @@ -271,3 +254,5 @@ void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, } } } + + diff --git a/code_c/Maasha/src/test/test_seq.c b/code_c/Maasha/src/test/test_seq.c new file mode 100644 index 0000000..db6d7bb --- /dev/null +++ b/code_c/Maasha/src/test/test_seq.c @@ -0,0 +1,60 @@ +#include "common.h" +#include "seq.h" + +static void test_seq_new(); +static void test_seq_destroy(); + + +int main() +{ + fprintf( stderr, "Running all tests for seq.c\n" ); + + test_seq_new(); + test_seq_destroy(); + + fprintf( stderr, "Done\n\n" ); + + return EXIT_SUCCESS; +} + + +void test_seq_new() +{ + fprintf( stderr, " Testing seq_new ... " ); + + seq_entry *entry; + size_t max_seq_name = MAX_SEQ_NAME; + size_t max_seq = MAX_SEQ; + + seq_new( &entry, max_seq_name, max_seq ); + + assert( entry->seq_name != NULL ); + assert( entry->seq != NULL ); + assert( entry->seq_len == 0 ); + + seq_destroy( entry ); + + fprintf( stderr, "OK\n" ); +} + + +void test_seq_destroy() +{ + fprintf( stderr, " Testing seq_destroy ... " ); + + seq_entry *entry; + size_t max_seq_name = MAX_SEQ_NAME; + size_t max_seq = MAX_SEQ; + + seq_new( &entry, max_seq_name, max_seq ); + + assert( entry->seq_name != NULL ); + assert( entry->seq != NULL ); + assert( entry->seq_len == 0 ); + + seq_destroy( entry ); + + fprintf( stderr, "OK\n" ); +} + + diff --git a/code_c/Maasha/src/tetra_count.c b/code_c/Maasha/src/tetra_count.c new file mode 100644 index 0000000..60de794 --- /dev/null +++ b/code_c/Maasha/src/tetra_count.c @@ -0,0 +1,120 @@ +#include "common.h" +#include "filesys.h" +#include "mem.h" +#include "seq.h" +#include "fasta.h" + +#define BLOCK32 15 +#define ARRAY_SIZE ( 1 << ( BLOCK32 * 2 ) ) + +#define BLOCK_SIZE 4 +#define MAX_SPACE 256 +#define T 3 /* 11 on the rightmost two bits of bin. */ +#define C 1 /* 01 on the rightmost two bits of bin. */ +#define G 2 /* 10 on the rightmost two bits of bin. */ + + +uint mask_create( int size ); +void block_count( char *path, uint **array_ppt, seq_entry **entry_ppt, uint mask ); + + +int main( int argc, char *argv[] ) +{ + uint mask = 0; + int i = 0; + uint *array = NULL; + seq_entry *entry = NULL; + + mask = mask_create( BLOCK_SIZE ); + + array = mem_get_zero( sizeof( uint ) * ARRAY_SIZE ); + + seq_new( &entry, MAX_SEQ_NAME, MAX_SEQ ); + + printf( "mask: %d\n", mask ); + + for ( i = 1; i < argc; i++ ) { + block_count( argv[ i ], &array, &entry, mask ); + } + + return EXIT_SUCCESS; +} + + +uint mask_create( int size ) +{ + /* Martin A. Hansen, June 2008 */ + + /* Create a bit mask. */ + + uint i = 0; + uint mask = 0; + + for ( i = 0; i < size; i++ ) + { + mask <<= 2; + + mask |= 3; /* add 11 to mask */ + } + + return mask; +} + + +void block_count( char *path, uint **array_ppt, seq_entry **entry_ppt, uint mask ) +{ + /* Martin A. Hansen, August 2008 */ + + /* Scan a FASTA file for ... */ + + uint *array = *array_ppt; + seq_entry *entry = *entry_ppt; + FILE *fp = NULL; + uint i = 0; + uint j = 0; + uint bin1 = 0; + uint bin2 = 0; + + fp = read_open( path ); + + fprintf( stderr, "Processing: %s\n", path ); + + while ( ( fasta_get_entry( fp, &entry ) ) != 0 ) + { + fprintf( stderr, " Counting blocks in: %s ... ", entry->seq_name ); + + bin1 = 0; + j = 0; + + for ( i = 0; entry->seq[ i ]; i++ ) + { + bin <<= 2; + + switch( entry->seq[ i ] ) + { + case 'A': case 'a': j++; break; + case 'T': case 't': bin |= T; j++; break; + case 'C': case 'c': bin |= C; j++; break; + case 'G': case 'g': bin |= G; j++; break; + default: bin = 0; j = 0; break; + } + + if ( j >= BLOCK32 ) + { + array[ ( bin & mask ) ]++; +/* + printf( "\n" ); + printf( "mask : %s\n", bits2string( mask ) ); + printf( "bin : %s\n", bits2string( bin ) ); + printf( "bin & mask: %s\n", bits2string( bin & mask ) ); +*/ + } + } + + fprintf( stderr, "done.\n" ); + } + + fprintf( stderr, "done.\n" ); + + close_stream( fp ); +} diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index 8fdce8b..11c8020 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -366,6 +366,7 @@ sub get_options { @options = qw( data_in|i=s + samples|s=s num|n=s ); } @@ -964,6 +965,7 @@ sub get_options $options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" }; $options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" }; $options{ "formats" } = [ split ",", $options{ "formats" } ] if defined $options{ "formats" }; + $options{ "samples" } = [ split ",", $options{ "samples" } ] if defined $options{ "samples" }; # ---- check arguments ---- @@ -1758,6 +1760,10 @@ sub script_read_soft foreach $sample ( @samples ) { + if ( $options->{ "samples" } ) { + next if grep { $sample->[ 0 ] !~ $_ } @{ $options->{ "samples" } }; + } + $records = Maasha::NCBI::soft_get_sample( $fh, $plat_table, $sample->[ 1 ] - $old_end - 1, $sample->[ 2 ] - $old_end - 1 ); foreach $record ( @{ $records } ) -- 2.39.5