From: martinahansen Date: Wed, 3 Sep 2008 05:26:07 +0000 (+0000) Subject: added calc_fixedstep biopiece X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=97dbe2e589c9f7b57d055f6940e8ee06c8f32f31;p=biopieces.git added calc_fixedstep biopiece git-svn-id: http://biopieces.googlecode.com/svn/trunk@240 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_bin/calc_fixedstep b/bp_bin/calc_fixedstep new file mode 100755 index 0000000..4cd1d44 --- /dev/null +++ b/bp_bin/calc_fixedstep @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biopieces; diff --git a/code_c/Maasha/src/Makefile b/code_c/Maasha/src/Makefile index 0ab1de8..efed4f9 100644 --- a/code_c/Maasha/src/Makefile +++ b/code_c/Maasha/src/Makefile @@ -9,7 +9,7 @@ TEST_DIR = test/ INC = -I $(INC_DIR) LIB = -lm $(LIB_DIR)*.o -all: libs utest fasta_count repeat-O-matic tetra_count +all: libs utest bipartite_scan fasta_count repeat-O-matic tetra_count libs: cd $(LIB_DIR) && ${MAKE} all @@ -17,6 +17,9 @@ libs: utest: cd $(TEST_DIR) && ${MAKE} all +bipartite_scan: bipartite_scan.c + $(CC) $(Cflags) $(INC) $(LIB) bipartite_scan.c -o bipartite_scan + fasta_count: fasta_count.c $(CC) $(Cflags) $(INC) $(LIB) fasta_count.c -o fasta_count @@ -29,6 +32,7 @@ tetra_count: tetra_count.c clean: cd $(LIB_DIR) && ${MAKE} clean cd $(TEST_DIR) && ${MAKE} clean + rm bipartite_scan rm fasta_count rm repeat-O-matic rm tetra_count diff --git a/code_c/Maasha/src/bipartite_scan.c b/code_c/Maasha/src/bipartite_scan.c new file mode 100644 index 0000000..05078bd --- /dev/null +++ b/code_c/Maasha/src/bipartite_scan.c @@ -0,0 +1,218 @@ +#include "common.h" +#include "mem.h" +#include "list.h" + +#define BLOCK_SIZE_BITS 8 /* one block holds 8 bits */ +#define BLOCK_SIZE_NT 4 /* one block holds 4 nucleotides */ +#define BITS_IN_NT 2 /* two bits holds 1 nucleotide */ + +#define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */ +#define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */ +#define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */ +#define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */ + +typedef unsigned char block; + +char *block2dna[256] = { + "AAAA", "AAAC", "AAAG", "AAAT", "AACA", "AACC", "AACG", "AACT", + "AAGA", "AAGC", "AAGG", "AAGT", "AATA", "AATC", "AATG", "AATT", + "ACAA", "ACAC", "ACAG", "ACAT", "ACCA", "ACCC", "ACCG", "ACCT", + "ACGA", "ACGC", "ACGG", "ACGT", "ACTA", "ACTC", "ACTG", "ACTT", + "AGAA", "AGAC", "AGAG", "AGAT", "AGCA", "AGCC", "AGCG", "AGCT", + "AGGA", "AGGC", "AGGG", "AGGT", "AGTA", "AGTC", "AGTG", "AGTT", + "ATAA", "ATAC", "ATAG", "ATAT", "ATCA", "ATCC", "ATCG", "ATCT", + "ATGA", "ATGC", "ATGG", "ATGT", "ATTA", "ATTC", "ATTG", "ATTT", + "CAAA", "CAAC", "CAAG", "CAAT", "CACA", "CACC", "CACG", "CACT", + "CAGA", "CAGC", "CAGG", "CAGT", "CATA", "CATC", "CATG", "CATT", + "CCAA", "CCAC", "CCAG", "CCAT", "CCCA", "CCCC", "CCCG", "CCCT", + "CCGA", "CCGC", "CCGG", "CCGT", "CCTA", "CCTC", "CCTG", "CCTT", + "CGAA", "CGAC", "CGAG", "CGAT", "CGCA", "CGCC", "CGCG", "CGCT", + "CGGA", "CGGC", "CGGG", "CGGT", "CGTA", "CGTC", "CGTG", "CGTT", + "CTAA", "CTAC", "CTAG", "CTAT", "CTCA", "CTCC", "CTCG", "CTCT", + "CTGA", "CTGC", "CTGG", "CTGT", "CTTA", "CTTC", "CTTG", "CTTT", + "GAAA", "GAAC", "GAAG", "GAAT", "GACA", "GACC", "GACG", "GACT", + "GAGA", "GAGC", "GAGG", "GAGT", "GATA", "GATC", "GATG", "GATT", + "GCAA", "GCAC", "GCAG", "GCAT", "GCCA", "GCCC", "GCCG", "GCCT", + "GCGA", "GCGC", "GCGG", "GCGT", "GCTA", "GCTC", "GCTG", "GCTT", + "GGAA", "GGAC", "GGAG", "GGAT", "GGCA", "GGCC", "GGCG", "GGCT", + "GGGA", "GGGC", "GGGG", "GGGT", "GGTA", "GGTC", "GGTG", "GGTT", + "GTAA", "GTAC", "GTAG", "GTAT", "GTCA", "GTCC", "GTCG", "GTCT", + "GTGA", "GTGC", "GTGG", "GTGT", "GTTA", "GTTC", "GTTG", "GTTT", + "TAAA", "TAAC", "TAAG", "TAAT", "TACA", "TACC", "TACG", "TACT", + "TAGA", "TAGC", "TAGG", "TAGT", "TATA", "TATC", "TATG", "TATT", + "TCAA", "TCAC", "TCAG", "TCAT", "TCCA", "TCCC", "TCCG", "TCCT", + "TCGA", "TCGC", "TCGG", "TCGT", "TCTA", "TCTC", "TCTG", "TCTT", + "TGAA", "TGAC", "TGAG", "TGAT", "TGCA", "TGCC", "TGCG", "TGCT", + "TGGA", "TGGC", "TGGG", "TGGT", "TGTA", "TGTC", "TGTG", "TGTT", + "TTAA", "TTAC", "TTAG", "TTAT", "TTCA", "TTCC", "TTCG", "TTCT", + "TTGA", "TTGC", "TTGG", "TTGT", "TTTA", "TTTC", "TTTG", "TTTT" +}; + +bool block_fill( char *seq, size_t offset, block *new_block ); +void block_print( list_sl *list ); +void scan_seq( char *seq, size_t seq_len ); + +static void run_tests(); +static void test_block_fill(); +static void test_scan_seq(); + + +int main() +{ + run_tests(); + + return EXIT_SUCCESS; +} + + +bool block_fill( char *seq, size_t offset, block *new_block_pt ) +{ + /* Martin A. Hansen, September 2008. */ + + /* Fills a block with sequence (converted to bits) from */ + /* a given offset. If the sequence contains non-DNA residues */ + /* the function returns FALSE. */ + + block new_block = *new_block_pt; + int i = 0; + + for ( i = 0; i < BLOCK_SIZE_NT; i++ ) + { + new_block <<= BITS_IN_NT; + + switch( seq[ offset + i ] ) + { + case 'A': case 'a': add_A( new_block ); break; + case 'T': case 't': add_T( new_block ); break; + case 'C': case 'c': add_C( new_block ); break; + case 'G': case 'g': add_G( new_block ); break; + default: return FALSE; + } + } + + *new_block_pt = new_block; + + return TRUE; +} + + +void scan_seq( char *seq, size_t seq_len ) +{ + /* Martin A. Hansen, September 2008. */ + + block b = 0; + short b_count = 0; + size_t i = 0; + list_sl *list = NULL; + node_sl *node_new = NULL; + node_sl *node_old = NULL; + bool first_node = TRUE; + + list_sl_new( &list ); + + node_new = mem_get( sizeof( node_sl ) ); + node_old = mem_get( sizeof( node_sl ) ); + + while ( i < seq_len - BLOCK_SIZE_NT + 1 ) + { + b <<= BITS_IN_NT; + + switch( seq[ i ] ) + { + case 'A': case 'a': add_A( b ); b_count++; break; + case 'T': case 't': add_T( b ); b_count++; break; + case 'C': case 'c': add_C( b ); b_count++; break; + case 'G': case 'g': add_G( b ); b_count++; break; + default: b = 0; b_count = 0; break; + } + + if ( b_count >= BLOCK_SIZE_NT ) + { + node_new->val = "FISK"; + + if ( first_node ) + { + list_sl_add_beg( &list, &node_new ); + } + else + { + list_sl_add_after( &node_old, &node_new ); + } + + node_old = node_new; + + first_node = FALSE; + } + + i++; + } + + block_print( list ); + + list_sl_destroy( &list ); +} + + +void block_print( list_sl *list ) +{ + node_sl *node = list->first; + + while ( node != NULL ) + { + printf( "BLOCK: %s\n", ( char * ) node->val ); + + node = node->next; + } +} + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> UNIT TESTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */ + + +void run_tests() +{ + printf( "Running tests\n" ); + + test_block_fill(); + test_scan_seq(); + + printf( "All tests OK\n" ); +} + + +void test_block_fill() +{ + printf( " Running test_block_fill ... " ); + + char *seq = "AAAATCGGCTA"; + size_t seq_len = strlen( seq ); + size_t offset = 0; + block new_block = 0; + + for ( offset = 0; offset < seq_len - BLOCK_SIZE_NT + 1; offset++ ) + { + assert( ( block_fill( seq, offset, &new_block ) == TRUE ) ); + +// printf( "BLOCK: %s\n", block2dna[ new_block ] ); + } + + printf( "done.\n"); +} + + +void test_scan_seq() +{ + printf( " Running test_scan_seq ... " ); + + char *seq = "AAAATCGGCTA"; + size_t seq_len = strlen( seq ); + + scan_seq( seq, seq_len ); + + printf( "done.\n"); +} + + +/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */ + + diff --git a/code_c/Maasha/src/inc/hash.h b/code_c/Maasha/src/inc/hash.h index 5c26d34..34e1cb8 100644 --- a/code_c/Maasha/src/inc/hash.h +++ b/code_c/Maasha/src/inc/hash.h @@ -1,40 +1,46 @@ -/* Structure of a generic hash. */ -struct hash +/* Structure of a generic hash element. */ +struct _hash_elem { - struct hash_elem **table; - uint mask; - int table_size; - int elem_count; + struct _hash_elem *next; + char *key; + void *val; }; -/* Structure of a generic hash element. */ -struct hash_elem +typedef struct _hash_elem hash_elem; + +/* Structure of a generic hash. */ +struct _hash { - struct hash_elem *next; - char *key; - void *val; + hash_elem **table; + uint mask; + int table_size; + int elem_count; }; +typedef struct _hash hash; + /* Initialize a new generic hash structure. */ -struct hash *hash_new( size_t size ); +void hash_new( hash **hash_pt, size_t size ); /* Hash function that generates a hash key, */ -uint hash_key( char *string ); +uint hash_key( char *string ); /* Add a new hash element consisting of a key/value pair to an existing hash. */ -void hash_add( struct hash *myhash, char *key, void *val ); +void hash_add( hash *hash_pt, char *key, void *val ); /* Lookup a key in a given hash and return the value - or NULL if not found. */ -void *hash_get( struct hash *myhash, char *key ); +void *hash_get( hash *hash_pt, char *key ); /* Lookup a key in a given hash and return the hash element - or NULL if not found. */ -struct hash_elem *hash_get_elem( struct hash *myhash, char *key ); +hash_elem *hash_get_elem( hash *hash_pt, char *key ); /* Remove key/value pair from a given hash. Returns true if a remove was successful. */ -bool hash_del( struct hash *myhash, char *key ); +bool hash_del( hash *hash_pt, char *key ); /* Deallocate memory for hash and all hash elements. */ -void hash_destroy( struct hash *myhash ); +void hash_destroy( hash *hash_pt ); /* Output some collision stats for a given hash. */ -void hash_collision_stats( struct hash *myhash ); +void hash_collision_stats( hash *hash_pt ); + + diff --git a/code_c/Maasha/src/lib/hash.c b/code_c/Maasha/src/lib/hash.c index acdc991..e3727b2 100644 --- a/code_c/Maasha/src/lib/hash.c +++ b/code_c/Maasha/src/lib/hash.c @@ -4,14 +4,14 @@ #include "list.h" -struct hash *hash_new( size_t size ) +void hash_new( hash **hash_ppt, size_t size ) { /* Martin A. Hansen, June 2008 */ /* Initialize a new generic hash structure. */ - struct hash *new_hash; - int table_size; + hash *new_hash = *hash_ppt; + size_t table_size = 0; new_hash = mem_get( sizeof( new_hash ) ); @@ -19,179 +19,178 @@ struct hash *hash_new( size_t size ) new_hash->table_size = table_size; new_hash->mask = table_size - 1; - new_hash->table = mem_get( sizeof( struct hash_elem * ) * table_size ); - + new_hash->table = mem_get( sizeof( hash_elem * ) * table_size ); new_hash->elem_count = 0; - return new_hash; -} - - -void hash_add( struct hash *myhash, char *key, void *val ) -{ - /* Martin A. Hansen, June 2008 */ - - /* Add a new hash element consisting of a key/value pair to an existing hash. */ - - struct hash_elem *old_elem; - struct hash_elem *new_elem; - int hash_index; - - if ( ( old_elem = hash_get_elem( myhash, key ) ) != NULL ) - { - old_elem->val = val; - } - else - { - new_elem = mem_get( sizeof( new_elem ) ); - - hash_index = ( hash_key( key ) & myhash->mask ); - - new_elem->key = mem_clone( key, strlen( key ) ); - new_elem->val = val; - new_elem->next = myhash->table[ hash_index ]; - - myhash->table[ hash_index ] = new_elem; - myhash->elem_count++; - } -} - - -void *hash_get( struct hash *myhash, char *key ) -{ - /* Martin A. Hansen, June 2008 */ - - /* Lookup a key in a given hash and return the value - or NULL if not found. */ - - struct hash_elem *bucket; - - bucket = myhash->table[ ( hash_key( key ) & myhash->mask ) ]; - - while ( bucket != NULL ) - { - if ( strcmp( bucket->key, key ) == 0 ) { - return bucket->val; - } - - bucket = bucket->next; - } - - return NULL; -} - - -struct hash_elem *hash_get_elem( struct hash *myhash, char *key ) -{ - /* Martin A. Hansen, June 2008 */ - - /* Lookup a key in a given hash and return the hash element - or NULL if not found. */ - - struct hash_elem *bucket; - - bucket = myhash->table[ ( hash_key( key ) & myhash->mask ) ]; - - while ( bucket != NULL ) - { - if ( strcmp( bucket->key, key ) == 0 ) { - return bucket; - } - - bucket = bucket->next; - } - - return NULL; -} - - -bool hash_del( struct hash *myhash, char *key ) -{ - /* Martin A. Hansen, June 2008 */ - - /* Remove key/value pair from a given hash. */ - /* Returns true if a remove was successful. */ - - struct hash_elem *bucket; - - bucket = myhash->table[ ( hash_key( key ) & myhash->mask ) ]; - - while ( bucket != NULL ) - { - if ( strcmp( bucket->key, key ) == 0 ) - { - myhash->elem_count--; - return TRUE; - } - - bucket = bucket->next; - } - - return FALSE; -} - - -void hash_destroy( struct hash *myhash ) -{ - /* Martin A. Hansen, June 2008 */ - - /* Deallocate memory for hash and all hash elements. */ - - int i; - struct hash_elem *bucket; - - for ( i = 0; i < myhash->table_size; i++ ) - { - for ( bucket = myhash->table[ i ]; bucket != NULL; bucket = bucket->next ) - { - mem_free( &bucket->key ); -// mem_free( bucket->val ); - mem_free( &bucket ); - } - } - - mem_free( ( void * ) &myhash->table ); - mem_free( ( void * ) &myhash ); + *hash_ppt = new_hash; } -uint hash_key( char *string ) -{ - /* Martin A. Hansen, June 2008 */ - - /* Hash function that generates a hash key, */ - /* based on the Jim Kent's stuff. */ - - char *key = string; - uint result = 0; - int c; - - while ( ( c = *key++ ) != '\0' ) { - result += ( result << 3 ) + c; - } - - return result; -} - - -void hash_collision_stats( struct hash *myhash ) -{ - /* Martin A. Hansen, June 2008 */ - - /* Output some collision stats for a given hash. */ - - /* Use with biotools: ... | plot_histogram -k Col -x */ - - int i; - int col; - struct hash_elem *bucket; - - for ( i = 0; i < myhash->table_size; i++ ) - { - col = 0; - - for ( bucket = myhash->table[ i ]; bucket != NULL; bucket = bucket->next ) { - col++; - } - - printf( "Col: %d\n---\n", col ); - } -} +//void hash_add( hash *hash_pt, char *key, void *val ) +//{ +// /* Martin A. Hansen, June 2008 */ +// +// /* Add a new hash element consisting of a key/value pair to an existing hash. */ +// +// hash_elem *old_elem = NULL; +// hash_elem *new_elem = NULL; +// size_t hash_index = 0; +// +// if ( ( old_elem = hash_get_elem( hash_pt, key ) ) != NULL ) +// { +// old_elem->val = val; +// } +// else +// { +// new_elem = mem_get( sizeof( hash_elem ) ); +// +// hash_index = ( hash_key( key ) & hash_pt->mask ); +// +// new_elem->key = mem_clone( key, strlen( key ) ); +// new_elem->val = val; +// new_elem->next = hash_pt->table[ hash_index ]; +// +// hash_pt->table[ hash_index ] = new_elem; +// hash_pt->elem_count++; +// } +//} + + +//void *hash_get( struct hash *myhash, char *key ) +//{ +// /* Martin A. Hansen, June 2008 */ +// +// /* Lookup a key in a given hash and return the value - or NULL if not found. */ +// +// struct hash_elem *bucket; +// +// bucket = myhash->table[ ( hash_key( key ) & myhash->mask ) ]; +// +// while ( bucket != NULL ) +// { +// if ( strcmp( bucket->key, key ) == 0 ) { +// return bucket->val; +// } +// +// bucket = bucket->next; +// } +// +// return NULL; +//} +// +// +//struct hash_elem *hash_get_elem( struct hash *myhash, char *key ) +//{ +// /* Martin A. Hansen, June 2008 */ +// +// /* Lookup a key in a given hash and return the hash element - or NULL if not found. */ +// +// struct hash_elem *bucket; +// +// bucket = myhash->table[ ( hash_key( key ) & myhash->mask ) ]; +// +// while ( bucket != NULL ) +// { +// if ( strcmp( bucket->key, key ) == 0 ) { +// return bucket; +// } +// +// bucket = bucket->next; +// } +// +// return NULL; +//} +// +// +//bool hash_del( struct hash *myhash, char *key ) +//{ +// /* Martin A. Hansen, June 2008 */ +// +// /* Remove key/value pair from a given hash. */ +// /* Returns true if a remove was successful. */ +// +// struct hash_elem *bucket; +// +// bucket = myhash->table[ ( hash_key( key ) & myhash->mask ) ]; +// +// while ( bucket != NULL ) +// { +// if ( strcmp( bucket->key, key ) == 0 ) +// { +// myhash->elem_count--; +// return TRUE; +// } +// +// bucket = bucket->next; +// } +// +// return FALSE; +//} +// +// +//void hash_destroy( struct hash *myhash ) +//{ +// /* Martin A. Hansen, June 2008 */ +// +// /* Deallocate memory for hash and all hash elements. */ +// +// int i; +// struct hash_elem *bucket; +// +// for ( i = 0; i < myhash->table_size; i++ ) +// { +// for ( bucket = myhash->table[ i ]; bucket != NULL; bucket = bucket->next ) +// { +// mem_free( &bucket->key ); +//// mem_free( bucket->val ); +// mem_free( &bucket ); +// } +// } +// +// mem_free( ( void * ) &myhash->table ); +// mem_free( ( void * ) &myhash ); +//} +// +// +//uint hash_key( char *string ) +//{ +// /* Martin A. Hansen, June 2008 */ +// +// /* Hash function that generates a hash key, */ +// /* based on the Jim Kent's stuff. */ +// +// char *key = string; +// uint result = 0; +// int c; +// +// while ( ( c = *key++ ) != '\0' ) { +// result += ( result << 3 ) + c; +// } +// +// return result; +//} +// +// +//void hash_collision_stats( struct hash *myhash ) +//{ +// /* Martin A. Hansen, June 2008 */ +// +// /* Output some collision stats for a given hash. */ +// +// /* Use with biotools: ... | plot_histogram -k Col -x */ +// +// int i; +// int col; +// struct hash_elem *bucket; +// +// for ( i = 0; i < myhash->table_size; i++ ) +// { +// col = 0; +// +// for ( bucket = myhash->table[ i ]; bucket != NULL; bucket = bucket->next ) { +// col++; +// } +// +// printf( "Col: %d\n---\n", col ); +// } +//} diff --git a/code_c/Maasha/src/test/Makefile b/code_c/Maasha/src/test/Makefile index bd3b235..283030d 100644 --- a/code_c/Maasha/src/test/Makefile +++ b/code_c/Maasha/src/test/Makefile @@ -9,7 +9,7 @@ LIB = -lm $(LIB_DIR)*.o all: test -test: test_bits test_common test_fasta test_filesys test_list test_mem test_seq test_strings +test: test_bits test_common test_fasta test_filesys test_hash test_list test_mem test_seq test_strings test_bits: test_bits.c $(LIB_DIR)bits.c $(CC) $(Cflags) $(INC) $(LIB) test_bits.c -o test_bits @@ -23,6 +23,9 @@ test_fasta: test_fasta.c $(LIB_DIR)fasta.c test_filesys: test_filesys.c $(LIB_DIR)filesys.c $(CC) $(Cflags) $(INC) $(LIB) test_filesys.c -o test_filesys +test_hash: test_hash.c $(LIB_DIR)hash.c + $(CC) $(Cflags) $(INC) $(LIB) test_hash.c -o test_hash + test_list: test_list.c $(LIB_DIR)list.c $(CC) $(Cflags) $(INC) $(LIB) test_list.c -o test_list @@ -40,6 +43,7 @@ clean: rm test_common rm test_fasta rm test_filesys + rm test_hash rm test_list rm test_mem rm test_seq diff --git a/code_c/Maasha/src/test/test_hash.c b/code_c/Maasha/src/test/test_hash.c new file mode 100644 index 0000000..d4a5041 --- /dev/null +++ b/code_c/Maasha/src/test/test_hash.c @@ -0,0 +1,35 @@ +#include "common.h" +#include "mem.h" +#include "hash.h" + +static void test_hash_new(); + +int main() +{ + fprintf( stderr, "Running all tests for hash.c\n" ); + + test_hash_new(); + + fprintf( stderr, "Done\n\n" ); + + return EXIT_SUCCESS; +} + + +void test_hash_new() +{ + fprintf( stderr, " Testing hash_new ... " ); + + hash *hash_pt = NULL; + int size = 256; + + hash_new( &hash_pt, size ); + + assert( hash_pt->table_size == ( 1 << size ) ); + assert( hash_pt->mask == ( 1 << size ) - 1 ); + assert( hash_pt->elem_count == 0 ); + + fprintf( stderr, "OK\n" ); +} + + diff --git a/code_c/Maasha/src/test/test_list.c b/code_c/Maasha/src/test/test_list.c index 6e743ee..0cb598d 100644 --- a/code_c/Maasha/src/test/test_list.c +++ b/code_c/Maasha/src/test/test_list.c @@ -79,6 +79,10 @@ void test_list_sl_add_beg() node2->val = "TEST2"; node3->val = "TEST3"; + node1->next = NULL; + node2->next = NULL; + node3->next = NULL; + assert( list->first == NULL ); list_sl_add_beg( &list, &node1 ); @@ -93,6 +97,21 @@ void test_list_sl_add_beg() assert( list->first == node3 ); assert( strcmp( list->first->val, "TEST3" ) == 0 ); + node2->val = "FOP"; + + list_sl_print( list ); + + node1->val = "TEST4"; + node1->next = NULL; + + list_sl_add_beg( &list, &node1 ); + assert( list->first == node1 ); + assert( strcmp( list->first->val, "TEST4" ) == 0 ); + + node1->next = NULL; + + list_sl_print( list ); + fprintf( stderr, "OK\n" ); } @@ -518,3 +537,4 @@ void test_list_dl_destroy() fprintf( stderr, "OK\n" ); } + diff --git a/code_c/Maasha/src/tetra_count.c b/code_c/Maasha/src/tetra_count.c index 5d7a481..7113c00 100644 --- a/code_c/Maasha/src/tetra_count.c +++ b/code_c/Maasha/src/tetra_count.c @@ -4,120 +4,89 @@ #include "seq.h" #include "fasta.h" -#define BITS32 15 -#define ARRAY_SIZE ( 1 << ( BITS32 * 2 ) ) - -#define BLOCK_SIZE 4 -#define MAX_SPACE 256 +/* + * bipartite_scan locates motivs consisting of two blocks of tetra nucleotides seperated by + * a distance between 0 and 256 bases. This is done by converting the tetra nucleotide blocks + * into 2-bits per base thus using the first 16 bits of a unsigned int (which is 32 bits). The + * remaning 16 bits exactly holds the distance (binary) between the blocks (using 9 bits for 256). + * For example in the following sequence the two tetra nucletotide blocks (in capital letters) + * are sepetarated by 13 bases: + * + * tatagtacgttATCGatgagctagctgtTGCAgtgtgatac + * + * Thus the resulting parts of the count block will look like this: + * + * ATCG=00110110 TGCA=11100100 and 13=000001101 (using A=00 T=11 C=01 and G=10). + * + * And the resulting bipartite block is then: 00110110111001000000011010000000 + * The remainin 7 bits are unused. + * + * The biparite block can then be used as an index in an unsigned int array that holds the count + * of all different bipartite blocks found in a sequence or list of sequnces (all genomes!). + * + * Rescanning the sequences locating the bipartite blocks of interest allows location of the + * exact motif positions. + * + * */ + +#define BLOCK_SIZE 4 +#define MAX_SPACE 256 +//#define ARRAY_SIZE ( 1 << 30 ) +#define ARRAY_SIZE 30 +#define WINDOW_SIZE ( ( 2 * BLOCK_SIZE ) + MAX_SPACE ) #define add_A( c ) /* add 00 on the rightmost two bits of c (i. e. do nothing). */ #define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */ #define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */ #define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */ -typedef uint bits32; - -uint mask_create( int size ); -void block_count( char *path, uint **bit32_array_ppt, seq_entry **entry_ppt, uint mask ); - +void bipartite_scan_file( char *file, uint **bipartite_array_ppt, seq_entry **entry_ppt ); +void bipartite_scan_entry( uint *bipartite_array, seq_entry *entry ); int main( int argc, char *argv[] ) { - uint mask = 0; - int i = 0; - uint *bit32_array = NULL; - seq_entry *entry = NULL; + int i = 0; + uint *bipartite_array = NULL; + seq_entry *entry = NULL; - mask = mask_create( BLOCK_SIZE ); - - bit32_array = mem_get_zero( sizeof( uint ) * ARRAY_SIZE ); + bipartite_array = mem_get_zero( sizeof( uint ) * ARRAY_SIZE ); seq_new( &entry, MAX_SEQ_NAME, MAX_SEQ ); - for ( i = 1; i < argc; i++ ) { - block_count( argv[ i ], &bit32_array, &entry, mask ); + for ( i = 1; i < argc; i++ ) + { + bipartite_scan_file( argv[ i ], &bipartite_array, &entry ); } + seq_destroy( entry ); + + mem_free( &bipartite_array ); + return EXIT_SUCCESS; } -uint mask_create( int size ) +void bipartite_scan_file( char *file, uint **bipartite_array_ppt, seq_entry **entry_ppt ) { - /* Martin A. Hansen, June 2008 */ - - /* Create a bit mask. */ + /* Martin A. Hansen, September 2008. */ - uint i = 0; - uint mask = 0; + uint *bipartite_array = *bipartite_array_ppt; + seq_entry *entry = *entry_ppt; + FILE *fp = NULL; - for ( i = 0; i < size; i++ ) - { - mask <<= 2; + fp = read_open( file ); - mask |= 3; /* add 11 to mask */ + while ( ( fasta_get_entry( fp, &entry ) != FALSE ) ) { + bipartite_scan_entry( bipartite_array, entry ); } - return mask; + close_stream( fp ); } -void block_count( char *path, uint **bit32_array_ppt, seq_entry **entry_ppt, uint mask ) +void bipartite_scan_entry( uint *bipartite_array, seq_entry *entry ) { - /* Martin A. Hansen, August 2008 */ - - /* Scan a FASTA file for bi-partive motifs consisting of two */ - /* blocks of BLOCKS_SIZE separated by up to MAX_SPACE space. */ + /* Martin A. Hansen, September 2008. */ - uint *bit32_array = *bit32_array_ppt; - seq_entry *entry = *entry_ppt; - FILE *fp = NULL; - uint i = 0; - uint j = 0; - uint bin = 0; - - fp = read_open( path ); - - fprintf( stderr, "Processing: %s\n", path ); - - while ( ( fasta_get_entry( fp, &entry ) ) != 0 ) - { - fprintf( stderr, " Counting blocks in: %s ... ", entry->seq_name ); - - bin = 0; - j = 0; - - for ( i = 0; entry->seq[ i ]; i++ ) - { - bin <<= 2; - - switch( entry->seq[ i ] ) - { - case 'A': case 'a': add_A( bin ); j++; break; - case 'T': case 't': add_T( bin ); j++; break; - case 'C': case 'c': add_C( bin ); j++; break; - case 'G': case 'g': add_G( bin ); j++; break; - default: bin = 0; j = 0; break; - } - - if ( j >= BLOCK_SIZE - 1 ) - { - bit32_array[ ( bin & mask ) ]++; -/* - printf( "\n" ); - printf( "mask : %s\n", bits2string( mask ) ); - printf( "bin : %s\n", bits2string( bin ) ); - printf( "bin & mask: %s\n", bits2string( bin & mask ) ); -*/ - } - } - - fprintf( stderr, "done.\n" ); - } - - fprintf( stderr, "done.\n" ); - - close_stream( fp ); + fasta_put_entry( entry ); } - - diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index 29a914a..3344774 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -199,6 +199,7 @@ sub run_script elsif ( $script eq "oligo_freq" ) { script_oligo_freq( $in, $out, $options ) } elsif ( $script eq "create_weight_matrix" ) { script_create_weight_matrix( $in, $out, $options ) } elsif ( $script eq "calc_bit_scores" ) { script_calc_bit_scores( $in, $out, $options ) } + elsif ( $script eq "calc_fixedstep" ) { script_calc_fixedstep( $in, $out, $options ) } elsif ( $script eq "reverse_seq" ) { script_reverse_seq( $in, $out, $options ) } elsif ( $script eq "complement_seq" ) { script_complement_seq( $in, $out, $options ) } elsif ( $script eq "remove_indels" ) { script_remove_indels( $in, $out, $options ) } @@ -440,6 +441,11 @@ sub get_options percent|p ); } + elsif ( $script eq "calc_fixedstep" ) + { + @options = qw( + ); + } elsif ( $script eq "transliterate_seq" ) { @options = qw( @@ -2486,6 +2492,110 @@ sub script_calc_bit_scores } +sub script_calc_fixedstep +{ + # Martin A. Hansen, September 2008. + + # Calculates fixedstep entries from data in the stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, %fh_hash, $fh_in, $fh_out, $chr, $chr, $beg, $end, $q_id, $block, $entry, $clones, $beg_block, $max, $i ); + + while ( $record = get_record( $in ) ) + { + $record->{ "CHR" } = $record->{ "S_ID" } if not defined $record->{ "CHR" }; + $record->{ "CHR_BEG" } = $record->{ "S_BEG" } if not defined $record->{ "CHR_BEG" }; + $record->{ "CHR_END" } = $record->{ "S_END" } if not defined $record->{ "CHR_END" }; + + if ( $record->{ "CHR" } and defined $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) + { + $fh_hash{ $record->{ "CHR" } } = Maasha::Common::write_open( "$BP_TMP/$record->{ 'CHR' }" ) if not exists $fh_hash{ $record->{ "CHR" } }; + + $fh_out = $fh_hash{ $record->{ "CHR" } }; + + Maasha::UCSC::bed_put_entry( $record, $fh_out, 5 ); + } + } + + map { close $_ } keys %fh_hash; + + foreach $chr ( sort keys %fh_hash ) + { + Maasha::Common::run( "bedSort", "$BP_TMP/$chr $BP_TMP/$chr" ); + + $fh_in = Maasha::Common::read_open( "$BP_TMP/$chr" ); + + undef $block; + + while ( $entry = Maasha::UCSC::bed_get_entry( $fh_in, 5 ) ) + { + $chr = $entry->{ 'CHR' }; + $beg = $entry->{ 'CHR_BEG' }; + $end = $entry->{ 'CHR_END' }; + $q_id = $entry->{ 'Q_ID' }; + + if ( $options->{ "score" } ) { + $clones = $entry->{ 'SCORE' }; + } elsif ( $q_id =~ /_(\d+)$/ ) { + $clones = $1; + } else { + $clones = 1; + } + + if ( $block ) + { + if ( $beg > $max ) + { + $record->{ "CHR" } = $chr; + $record->{ "CHR_BEG" } = $beg_block; + $record->{ "STEP" } = 1; + $record->{ "VALS" } = join ";", @{ $block }; + + put_record( $record, $out ); + + undef $block; + } + else + { + for ( $i = $beg - $beg_block; $i < ( $beg - $beg_block ) + ( $end - $beg ); $i++ ) { + $block->[ $i ] += $clones; + } + + $max = Maasha::Calc::max( $max, $end ); + } + } + + if ( not $block ) + { + $beg_block = $beg; + $max = $end; + + for ( $i = 0; $i < ( $end - $beg ); $i++ ) { + $block->[ $i ] += $clones; + } + } + } + + close $fh_in; + + $record->{ "CHR" } = $chr; + $record->{ "CHR_BEG" } = $beg_block; + $record->{ "STEP" } = 1; + $record->{ "VALS" } = join ";", @{ $block }; + + put_record( $record, $out ); + + unlink "$BP_TMP/$chr"; + } +} + + sub script_reverse_seq { # Martin A. Hansen, August 2007. @@ -5988,7 +6098,7 @@ sub script_upload_to_ucsc { if ( $beg > $max ) { - Maasha::UCSC::fixedstep_put_entry( $chr, $beg_block, $block, $fh_out ); + Maasha::UCSC::fixedstep_put_entry( $chr, $beg_block, $block, $fh_out, $options->{ "log10" } ); undef $block; } else