--- /dev/null
+#!/usr/bin/env perl
+
+use warnings;
+use strict;
+
+use Maasha::Biopieces;
INC = -I $(INC_DIR)
LIB = -lm $(LIB_DIR)*.o
-all: libs utest fasta_count repeat-O-matic tetra_count
+all: libs utest bipartite_scan fasta_count repeat-O-matic tetra_count
libs:
cd $(LIB_DIR) && ${MAKE} all
utest:
cd $(TEST_DIR) && ${MAKE} all
+bipartite_scan: bipartite_scan.c
+ $(CC) $(Cflags) $(INC) $(LIB) bipartite_scan.c -o bipartite_scan
+
fasta_count: fasta_count.c
$(CC) $(Cflags) $(INC) $(LIB) fasta_count.c -o fasta_count
clean:
cd $(LIB_DIR) && ${MAKE} clean
cd $(TEST_DIR) && ${MAKE} clean
+ rm bipartite_scan
rm fasta_count
rm repeat-O-matic
rm tetra_count
--- /dev/null
+#include "common.h"
+#include "mem.h"
+#include "list.h"
+
+#define BLOCK_SIZE_BITS 8 /* one block holds 8 bits */
+#define BLOCK_SIZE_NT 4 /* one block holds 4 nucleotides */
+#define BITS_IN_NT 2 /* two bits holds 1 nucleotide */
+
+#define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */
+#define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */
+#define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */
+#define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */
+
+typedef unsigned char block;
+
+char *block2dna[256] = {
+ "AAAA", "AAAC", "AAAG", "AAAT", "AACA", "AACC", "AACG", "AACT",
+ "AAGA", "AAGC", "AAGG", "AAGT", "AATA", "AATC", "AATG", "AATT",
+ "ACAA", "ACAC", "ACAG", "ACAT", "ACCA", "ACCC", "ACCG", "ACCT",
+ "ACGA", "ACGC", "ACGG", "ACGT", "ACTA", "ACTC", "ACTG", "ACTT",
+ "AGAA", "AGAC", "AGAG", "AGAT", "AGCA", "AGCC", "AGCG", "AGCT",
+ "AGGA", "AGGC", "AGGG", "AGGT", "AGTA", "AGTC", "AGTG", "AGTT",
+ "ATAA", "ATAC", "ATAG", "ATAT", "ATCA", "ATCC", "ATCG", "ATCT",
+ "ATGA", "ATGC", "ATGG", "ATGT", "ATTA", "ATTC", "ATTG", "ATTT",
+ "CAAA", "CAAC", "CAAG", "CAAT", "CACA", "CACC", "CACG", "CACT",
+ "CAGA", "CAGC", "CAGG", "CAGT", "CATA", "CATC", "CATG", "CATT",
+ "CCAA", "CCAC", "CCAG", "CCAT", "CCCA", "CCCC", "CCCG", "CCCT",
+ "CCGA", "CCGC", "CCGG", "CCGT", "CCTA", "CCTC", "CCTG", "CCTT",
+ "CGAA", "CGAC", "CGAG", "CGAT", "CGCA", "CGCC", "CGCG", "CGCT",
+ "CGGA", "CGGC", "CGGG", "CGGT", "CGTA", "CGTC", "CGTG", "CGTT",
+ "CTAA", "CTAC", "CTAG", "CTAT", "CTCA", "CTCC", "CTCG", "CTCT",
+ "CTGA", "CTGC", "CTGG", "CTGT", "CTTA", "CTTC", "CTTG", "CTTT",
+ "GAAA", "GAAC", "GAAG", "GAAT", "GACA", "GACC", "GACG", "GACT",
+ "GAGA", "GAGC", "GAGG", "GAGT", "GATA", "GATC", "GATG", "GATT",
+ "GCAA", "GCAC", "GCAG", "GCAT", "GCCA", "GCCC", "GCCG", "GCCT",
+ "GCGA", "GCGC", "GCGG", "GCGT", "GCTA", "GCTC", "GCTG", "GCTT",
+ "GGAA", "GGAC", "GGAG", "GGAT", "GGCA", "GGCC", "GGCG", "GGCT",
+ "GGGA", "GGGC", "GGGG", "GGGT", "GGTA", "GGTC", "GGTG", "GGTT",
+ "GTAA", "GTAC", "GTAG", "GTAT", "GTCA", "GTCC", "GTCG", "GTCT",
+ "GTGA", "GTGC", "GTGG", "GTGT", "GTTA", "GTTC", "GTTG", "GTTT",
+ "TAAA", "TAAC", "TAAG", "TAAT", "TACA", "TACC", "TACG", "TACT",
+ "TAGA", "TAGC", "TAGG", "TAGT", "TATA", "TATC", "TATG", "TATT",
+ "TCAA", "TCAC", "TCAG", "TCAT", "TCCA", "TCCC", "TCCG", "TCCT",
+ "TCGA", "TCGC", "TCGG", "TCGT", "TCTA", "TCTC", "TCTG", "TCTT",
+ "TGAA", "TGAC", "TGAG", "TGAT", "TGCA", "TGCC", "TGCG", "TGCT",
+ "TGGA", "TGGC", "TGGG", "TGGT", "TGTA", "TGTC", "TGTG", "TGTT",
+ "TTAA", "TTAC", "TTAG", "TTAT", "TTCA", "TTCC", "TTCG", "TTCT",
+ "TTGA", "TTGC", "TTGG", "TTGT", "TTTA", "TTTC", "TTTG", "TTTT"
+};
+
+bool block_fill( char *seq, size_t offset, block *new_block );
+void block_print( list_sl *list );
+void scan_seq( char *seq, size_t seq_len );
+
+static void run_tests();
+static void test_block_fill();
+static void test_scan_seq();
+
+
+int main()
+{
+ run_tests();
+
+ return EXIT_SUCCESS;
+}
+
+
+bool block_fill( char *seq, size_t offset, block *new_block_pt )
+{
+ /* Martin A. Hansen, September 2008. */
+
+ /* Fills a block with sequence (converted to bits) from */
+ /* a given offset. If the sequence contains non-DNA residues */
+ /* the function returns FALSE. */
+
+ block new_block = *new_block_pt;
+ int i = 0;
+
+ for ( i = 0; i < BLOCK_SIZE_NT; i++ )
+ {
+ new_block <<= BITS_IN_NT;
+
+ switch( seq[ offset + i ] )
+ {
+ case 'A': case 'a': add_A( new_block ); break;
+ case 'T': case 't': add_T( new_block ); break;
+ case 'C': case 'c': add_C( new_block ); break;
+ case 'G': case 'g': add_G( new_block ); break;
+ default: return FALSE;
+ }
+ }
+
+ *new_block_pt = new_block;
+
+ return TRUE;
+}
+
+
+void scan_seq( char *seq, size_t seq_len )
+{
+ /* Martin A. Hansen, September 2008. */
+
+ block b = 0;
+ short b_count = 0;
+ size_t i = 0;
+ list_sl *list = NULL;
+ node_sl *node_new = NULL;
+ node_sl *node_old = NULL;
+ bool first_node = TRUE;
+
+ list_sl_new( &list );
+
+ node_new = mem_get( sizeof( node_sl ) );
+ node_old = mem_get( sizeof( node_sl ) );
+
+ while ( i < seq_len - BLOCK_SIZE_NT + 1 )
+ {
+ b <<= BITS_IN_NT;
+
+ switch( seq[ i ] )
+ {
+ case 'A': case 'a': add_A( b ); b_count++; break;
+ case 'T': case 't': add_T( b ); b_count++; break;
+ case 'C': case 'c': add_C( b ); b_count++; break;
+ case 'G': case 'g': add_G( b ); b_count++; break;
+ default: b = 0; b_count = 0; break;
+ }
+
+ if ( b_count >= BLOCK_SIZE_NT )
+ {
+ node_new->val = "FISK";
+
+ if ( first_node )
+ {
+ list_sl_add_beg( &list, &node_new );
+ }
+ else
+ {
+ list_sl_add_after( &node_old, &node_new );
+ }
+
+ node_old = node_new;
+
+ first_node = FALSE;
+ }
+
+ i++;
+ }
+
+ block_print( list );
+
+ list_sl_destroy( &list );
+}
+
+
+void block_print( list_sl *list )
+{
+ node_sl *node = list->first;
+
+ while ( node != NULL )
+ {
+ printf( "BLOCK: %s\n", ( char * ) node->val );
+
+ node = node->next;
+ }
+}
+
+
+/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> UNIT TESTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
+
+
+void run_tests()
+{
+ printf( "Running tests\n" );
+
+ test_block_fill();
+ test_scan_seq();
+
+ printf( "All tests OK\n" );
+}
+
+
+void test_block_fill()
+{
+ printf( " Running test_block_fill ... " );
+
+ char *seq = "AAAATCGGCTA";
+ size_t seq_len = strlen( seq );
+ size_t offset = 0;
+ block new_block = 0;
+
+ for ( offset = 0; offset < seq_len - BLOCK_SIZE_NT + 1; offset++ )
+ {
+ assert( ( block_fill( seq, offset, &new_block ) == TRUE ) );
+
+// printf( "BLOCK: %s\n", block2dna[ new_block ] );
+ }
+
+ printf( "done.\n");
+}
+
+
+void test_scan_seq()
+{
+ printf( " Running test_scan_seq ... " );
+
+ char *seq = "AAAATCGGCTA";
+ size_t seq_len = strlen( seq );
+
+ scan_seq( seq, seq_len );
+
+ printf( "done.\n");
+}
+
+
+/* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< */
+
+
-/* Structure of a generic hash. */
-struct hash
+/* Structure of a generic hash element. */
+struct _hash_elem
{
- struct hash_elem **table;
- uint mask;
- int table_size;
- int elem_count;
+ struct _hash_elem *next;
+ char *key;
+ void *val;
};
-/* Structure of a generic hash element. */
-struct hash_elem
+typedef struct _hash_elem hash_elem;
+
+/* Structure of a generic hash. */
+struct _hash
{
- struct hash_elem *next;
- char *key;
- void *val;
+ hash_elem **table;
+ uint mask;
+ int table_size;
+ int elem_count;
};
+typedef struct _hash hash;
+
/* Initialize a new generic hash structure. */
-struct hash *hash_new( size_t size );
+void hash_new( hash **hash_pt, size_t size );
/* Hash function that generates a hash key, */
-uint hash_key( char *string );
+uint hash_key( char *string );
/* Add a new hash element consisting of a key/value pair to an existing hash. */
-void hash_add( struct hash *myhash, char *key, void *val );
+void hash_add( hash *hash_pt, char *key, void *val );
/* Lookup a key in a given hash and return the value - or NULL if not found. */
-void *hash_get( struct hash *myhash, char *key );
+void *hash_get( hash *hash_pt, char *key );
/* Lookup a key in a given hash and return the hash element - or NULL if not found. */
-struct hash_elem *hash_get_elem( struct hash *myhash, char *key );
+hash_elem *hash_get_elem( hash *hash_pt, char *key );
/* Remove key/value pair from a given hash. Returns true if a remove was successful. */
-bool hash_del( struct hash *myhash, char *key );
+bool hash_del( hash *hash_pt, char *key );
/* Deallocate memory for hash and all hash elements. */
-void hash_destroy( struct hash *myhash );
+void hash_destroy( hash *hash_pt );
/* Output some collision stats for a given hash. */
-void hash_collision_stats( struct hash *myhash );
+void hash_collision_stats( hash *hash_pt );
+
+
#include "list.h"
-struct hash *hash_new( size_t size )
+void hash_new( hash **hash_ppt, size_t size )
{
/* Martin A. Hansen, June 2008 */
/* Initialize a new generic hash structure. */
- struct hash *new_hash;
- int table_size;
+ hash *new_hash = *hash_ppt;
+ size_t table_size = 0;
new_hash = mem_get( sizeof( new_hash ) );
new_hash->table_size = table_size;
new_hash->mask = table_size - 1;
- new_hash->table = mem_get( sizeof( struct hash_elem * ) * table_size );
-
+ new_hash->table = mem_get( sizeof( hash_elem * ) * table_size );
new_hash->elem_count = 0;
- return new_hash;
-}
-
-
-void hash_add( struct hash *myhash, char *key, void *val )
-{
- /* Martin A. Hansen, June 2008 */
-
- /* Add a new hash element consisting of a key/value pair to an existing hash. */
-
- struct hash_elem *old_elem;
- struct hash_elem *new_elem;
- int hash_index;
-
- if ( ( old_elem = hash_get_elem( myhash, key ) ) != NULL )
- {
- old_elem->val = val;
- }
- else
- {
- new_elem = mem_get( sizeof( new_elem ) );
-
- hash_index = ( hash_key( key ) & myhash->mask );
-
- new_elem->key = mem_clone( key, strlen( key ) );
- new_elem->val = val;
- new_elem->next = myhash->table[ hash_index ];
-
- myhash->table[ hash_index ] = new_elem;
- myhash->elem_count++;
- }
-}
-
-
-void *hash_get( struct hash *myhash, char *key )
-{
- /* Martin A. Hansen, June 2008 */
-
- /* Lookup a key in a given hash and return the value - or NULL if not found. */
-
- struct hash_elem *bucket;
-
- bucket = myhash->table[ ( hash_key( key ) & myhash->mask ) ];
-
- while ( bucket != NULL )
- {
- if ( strcmp( bucket->key, key ) == 0 ) {
- return bucket->val;
- }
-
- bucket = bucket->next;
- }
-
- return NULL;
-}
-
-
-struct hash_elem *hash_get_elem( struct hash *myhash, char *key )
-{
- /* Martin A. Hansen, June 2008 */
-
- /* Lookup a key in a given hash and return the hash element - or NULL if not found. */
-
- struct hash_elem *bucket;
-
- bucket = myhash->table[ ( hash_key( key ) & myhash->mask ) ];
-
- while ( bucket != NULL )
- {
- if ( strcmp( bucket->key, key ) == 0 ) {
- return bucket;
- }
-
- bucket = bucket->next;
- }
-
- return NULL;
-}
-
-
-bool hash_del( struct hash *myhash, char *key )
-{
- /* Martin A. Hansen, June 2008 */
-
- /* Remove key/value pair from a given hash. */
- /* Returns true if a remove was successful. */
-
- struct hash_elem *bucket;
-
- bucket = myhash->table[ ( hash_key( key ) & myhash->mask ) ];
-
- while ( bucket != NULL )
- {
- if ( strcmp( bucket->key, key ) == 0 )
- {
- myhash->elem_count--;
- return TRUE;
- }
-
- bucket = bucket->next;
- }
-
- return FALSE;
-}
-
-
-void hash_destroy( struct hash *myhash )
-{
- /* Martin A. Hansen, June 2008 */
-
- /* Deallocate memory for hash and all hash elements. */
-
- int i;
- struct hash_elem *bucket;
-
- for ( i = 0; i < myhash->table_size; i++ )
- {
- for ( bucket = myhash->table[ i ]; bucket != NULL; bucket = bucket->next )
- {
- mem_free( &bucket->key );
-// mem_free( bucket->val );
- mem_free( &bucket );
- }
- }
-
- mem_free( ( void * ) &myhash->table );
- mem_free( ( void * ) &myhash );
+ *hash_ppt = new_hash;
}
-uint hash_key( char *string )
-{
- /* Martin A. Hansen, June 2008 */
-
- /* Hash function that generates a hash key, */
- /* based on the Jim Kent's stuff. */
-
- char *key = string;
- uint result = 0;
- int c;
-
- while ( ( c = *key++ ) != '\0' ) {
- result += ( result << 3 ) + c;
- }
-
- return result;
-}
-
-
-void hash_collision_stats( struct hash *myhash )
-{
- /* Martin A. Hansen, June 2008 */
-
- /* Output some collision stats for a given hash. */
-
- /* Use with biotools: ... | plot_histogram -k Col -x */
-
- int i;
- int col;
- struct hash_elem *bucket;
-
- for ( i = 0; i < myhash->table_size; i++ )
- {
- col = 0;
-
- for ( bucket = myhash->table[ i ]; bucket != NULL; bucket = bucket->next ) {
- col++;
- }
-
- printf( "Col: %d\n---\n", col );
- }
-}
+//void hash_add( hash *hash_pt, char *key, void *val )
+//{
+// /* Martin A. Hansen, June 2008 */
+//
+// /* Add a new hash element consisting of a key/value pair to an existing hash. */
+//
+// hash_elem *old_elem = NULL;
+// hash_elem *new_elem = NULL;
+// size_t hash_index = 0;
+//
+// if ( ( old_elem = hash_get_elem( hash_pt, key ) ) != NULL )
+// {
+// old_elem->val = val;
+// }
+// else
+// {
+// new_elem = mem_get( sizeof( hash_elem ) );
+//
+// hash_index = ( hash_key( key ) & hash_pt->mask );
+//
+// new_elem->key = mem_clone( key, strlen( key ) );
+// new_elem->val = val;
+// new_elem->next = hash_pt->table[ hash_index ];
+//
+// hash_pt->table[ hash_index ] = new_elem;
+// hash_pt->elem_count++;
+// }
+//}
+
+
+//void *hash_get( struct hash *myhash, char *key )
+//{
+// /* Martin A. Hansen, June 2008 */
+//
+// /* Lookup a key in a given hash and return the value - or NULL if not found. */
+//
+// struct hash_elem *bucket;
+//
+// bucket = myhash->table[ ( hash_key( key ) & myhash->mask ) ];
+//
+// while ( bucket != NULL )
+// {
+// if ( strcmp( bucket->key, key ) == 0 ) {
+// return bucket->val;
+// }
+//
+// bucket = bucket->next;
+// }
+//
+// return NULL;
+//}
+//
+//
+//struct hash_elem *hash_get_elem( struct hash *myhash, char *key )
+//{
+// /* Martin A. Hansen, June 2008 */
+//
+// /* Lookup a key in a given hash and return the hash element - or NULL if not found. */
+//
+// struct hash_elem *bucket;
+//
+// bucket = myhash->table[ ( hash_key( key ) & myhash->mask ) ];
+//
+// while ( bucket != NULL )
+// {
+// if ( strcmp( bucket->key, key ) == 0 ) {
+// return bucket;
+// }
+//
+// bucket = bucket->next;
+// }
+//
+// return NULL;
+//}
+//
+//
+//bool hash_del( struct hash *myhash, char *key )
+//{
+// /* Martin A. Hansen, June 2008 */
+//
+// /* Remove key/value pair from a given hash. */
+// /* Returns true if a remove was successful. */
+//
+// struct hash_elem *bucket;
+//
+// bucket = myhash->table[ ( hash_key( key ) & myhash->mask ) ];
+//
+// while ( bucket != NULL )
+// {
+// if ( strcmp( bucket->key, key ) == 0 )
+// {
+// myhash->elem_count--;
+// return TRUE;
+// }
+//
+// bucket = bucket->next;
+// }
+//
+// return FALSE;
+//}
+//
+//
+//void hash_destroy( struct hash *myhash )
+//{
+// /* Martin A. Hansen, June 2008 */
+//
+// /* Deallocate memory for hash and all hash elements. */
+//
+// int i;
+// struct hash_elem *bucket;
+//
+// for ( i = 0; i < myhash->table_size; i++ )
+// {
+// for ( bucket = myhash->table[ i ]; bucket != NULL; bucket = bucket->next )
+// {
+// mem_free( &bucket->key );
+//// mem_free( bucket->val );
+// mem_free( &bucket );
+// }
+// }
+//
+// mem_free( ( void * ) &myhash->table );
+// mem_free( ( void * ) &myhash );
+//}
+//
+//
+//uint hash_key( char *string )
+//{
+// /* Martin A. Hansen, June 2008 */
+//
+// /* Hash function that generates a hash key, */
+// /* based on the Jim Kent's stuff. */
+//
+// char *key = string;
+// uint result = 0;
+// int c;
+//
+// while ( ( c = *key++ ) != '\0' ) {
+// result += ( result << 3 ) + c;
+// }
+//
+// return result;
+//}
+//
+//
+//void hash_collision_stats( struct hash *myhash )
+//{
+// /* Martin A. Hansen, June 2008 */
+//
+// /* Output some collision stats for a given hash. */
+//
+// /* Use with biotools: ... | plot_histogram -k Col -x */
+//
+// int i;
+// int col;
+// struct hash_elem *bucket;
+//
+// for ( i = 0; i < myhash->table_size; i++ )
+// {
+// col = 0;
+//
+// for ( bucket = myhash->table[ i ]; bucket != NULL; bucket = bucket->next ) {
+// col++;
+// }
+//
+// printf( "Col: %d\n---\n", col );
+// }
+//}
all: test
-test: test_bits test_common test_fasta test_filesys test_list test_mem test_seq test_strings
+test: test_bits test_common test_fasta test_filesys test_hash test_list test_mem test_seq test_strings
test_bits: test_bits.c $(LIB_DIR)bits.c
$(CC) $(Cflags) $(INC) $(LIB) test_bits.c -o test_bits
test_filesys: test_filesys.c $(LIB_DIR)filesys.c
$(CC) $(Cflags) $(INC) $(LIB) test_filesys.c -o test_filesys
+test_hash: test_hash.c $(LIB_DIR)hash.c
+ $(CC) $(Cflags) $(INC) $(LIB) test_hash.c -o test_hash
+
test_list: test_list.c $(LIB_DIR)list.c
$(CC) $(Cflags) $(INC) $(LIB) test_list.c -o test_list
rm test_common
rm test_fasta
rm test_filesys
+ rm test_hash
rm test_list
rm test_mem
rm test_seq
--- /dev/null
+#include "common.h"
+#include "mem.h"
+#include "hash.h"
+
+static void test_hash_new();
+
+int main()
+{
+ fprintf( stderr, "Running all tests for hash.c\n" );
+
+ test_hash_new();
+
+ fprintf( stderr, "Done\n\n" );
+
+ return EXIT_SUCCESS;
+}
+
+
+void test_hash_new()
+{
+ fprintf( stderr, " Testing hash_new ... " );
+
+ hash *hash_pt = NULL;
+ int size = 256;
+
+ hash_new( &hash_pt, size );
+
+ assert( hash_pt->table_size == ( 1 << size ) );
+ assert( hash_pt->mask == ( 1 << size ) - 1 );
+ assert( hash_pt->elem_count == 0 );
+
+ fprintf( stderr, "OK\n" );
+}
+
+
node2->val = "TEST2";
node3->val = "TEST3";
+ node1->next = NULL;
+ node2->next = NULL;
+ node3->next = NULL;
+
assert( list->first == NULL );
list_sl_add_beg( &list, &node1 );
assert( list->first == node3 );
assert( strcmp( list->first->val, "TEST3" ) == 0 );
+ node2->val = "FOP";
+
+ list_sl_print( list );
+
+ node1->val = "TEST4";
+ node1->next = NULL;
+
+ list_sl_add_beg( &list, &node1 );
+ assert( list->first == node1 );
+ assert( strcmp( list->first->val, "TEST4" ) == 0 );
+
+ node1->next = NULL;
+
+ list_sl_print( list );
+
fprintf( stderr, "OK\n" );
}
fprintf( stderr, "OK\n" );
}
+
#include "seq.h"
#include "fasta.h"
-#define BITS32 15
-#define ARRAY_SIZE ( 1 << ( BITS32 * 2 ) )
-
-#define BLOCK_SIZE 4
-#define MAX_SPACE 256
+/*
+ * bipartite_scan locates motivs consisting of two blocks of tetra nucleotides seperated by
+ * a distance between 0 and 256 bases. This is done by converting the tetra nucleotide blocks
+ * into 2-bits per base thus using the first 16 bits of a unsigned int (which is 32 bits). The
+ * remaning 16 bits exactly holds the distance (binary) between the blocks (using 9 bits for 256).
+ * For example in the following sequence the two tetra nucletotide blocks (in capital letters)
+ * are sepetarated by 13 bases:
+ *
+ * tatagtacgttATCGatgagctagctgtTGCAgtgtgatac
+ *
+ * Thus the resulting parts of the count block will look like this:
+ *
+ * ATCG=00110110 TGCA=11100100 and 13=000001101 (using A=00 T=11 C=01 and G=10).
+ *
+ * And the resulting bipartite block is then: 00110110111001000000011010000000
+ * The remainin 7 bits are unused.
+ *
+ * The biparite block can then be used as an index in an unsigned int array that holds the count
+ * of all different bipartite blocks found in a sequence or list of sequnces (all genomes!).
+ *
+ * Rescanning the sequences locating the bipartite blocks of interest allows location of the
+ * exact motif positions.
+ *
+ * */
+
+#define BLOCK_SIZE 4
+#define MAX_SPACE 256
+//#define ARRAY_SIZE ( 1 << 30 )
+#define ARRAY_SIZE 30
+#define WINDOW_SIZE ( ( 2 * BLOCK_SIZE ) + MAX_SPACE )
#define add_A( c ) /* add 00 on the rightmost two bits of c (i. e. do nothing). */
#define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */
#define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */
#define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */
-typedef uint bits32;
-
-uint mask_create( int size );
-void block_count( char *path, uint **bit32_array_ppt, seq_entry **entry_ppt, uint mask );
-
+void bipartite_scan_file( char *file, uint **bipartite_array_ppt, seq_entry **entry_ppt );
+void bipartite_scan_entry( uint *bipartite_array, seq_entry *entry );
int main( int argc, char *argv[] )
{
- uint mask = 0;
- int i = 0;
- uint *bit32_array = NULL;
- seq_entry *entry = NULL;
+ int i = 0;
+ uint *bipartite_array = NULL;
+ seq_entry *entry = NULL;
- mask = mask_create( BLOCK_SIZE );
-
- bit32_array = mem_get_zero( sizeof( uint ) * ARRAY_SIZE );
+ bipartite_array = mem_get_zero( sizeof( uint ) * ARRAY_SIZE );
seq_new( &entry, MAX_SEQ_NAME, MAX_SEQ );
- for ( i = 1; i < argc; i++ ) {
- block_count( argv[ i ], &bit32_array, &entry, mask );
+ for ( i = 1; i < argc; i++ )
+ {
+ bipartite_scan_file( argv[ i ], &bipartite_array, &entry );
}
+ seq_destroy( entry );
+
+ mem_free( &bipartite_array );
+
return EXIT_SUCCESS;
}
-uint mask_create( int size )
+void bipartite_scan_file( char *file, uint **bipartite_array_ppt, seq_entry **entry_ppt )
{
- /* Martin A. Hansen, June 2008 */
-
- /* Create a bit mask. */
+ /* Martin A. Hansen, September 2008. */
- uint i = 0;
- uint mask = 0;
+ uint *bipartite_array = *bipartite_array_ppt;
+ seq_entry *entry = *entry_ppt;
+ FILE *fp = NULL;
- for ( i = 0; i < size; i++ )
- {
- mask <<= 2;
+ fp = read_open( file );
- mask |= 3; /* add 11 to mask */
+ while ( ( fasta_get_entry( fp, &entry ) != FALSE ) ) {
+ bipartite_scan_entry( bipartite_array, entry );
}
- return mask;
+ close_stream( fp );
}
-void block_count( char *path, uint **bit32_array_ppt, seq_entry **entry_ppt, uint mask )
+void bipartite_scan_entry( uint *bipartite_array, seq_entry *entry )
{
- /* Martin A. Hansen, August 2008 */
-
- /* Scan a FASTA file for bi-partive motifs consisting of two */
- /* blocks of BLOCKS_SIZE separated by up to MAX_SPACE space. */
+ /* Martin A. Hansen, September 2008. */
- uint *bit32_array = *bit32_array_ppt;
- seq_entry *entry = *entry_ppt;
- FILE *fp = NULL;
- uint i = 0;
- uint j = 0;
- uint bin = 0;
-
- fp = read_open( path );
-
- fprintf( stderr, "Processing: %s\n", path );
-
- while ( ( fasta_get_entry( fp, &entry ) ) != 0 )
- {
- fprintf( stderr, " Counting blocks in: %s ... ", entry->seq_name );
-
- bin = 0;
- j = 0;
-
- for ( i = 0; entry->seq[ i ]; i++ )
- {
- bin <<= 2;
-
- switch( entry->seq[ i ] )
- {
- case 'A': case 'a': add_A( bin ); j++; break;
- case 'T': case 't': add_T( bin ); j++; break;
- case 'C': case 'c': add_C( bin ); j++; break;
- case 'G': case 'g': add_G( bin ); j++; break;
- default: bin = 0; j = 0; break;
- }
-
- if ( j >= BLOCK_SIZE - 1 )
- {
- bit32_array[ ( bin & mask ) ]++;
-/*
- printf( "\n" );
- printf( "mask : %s\n", bits2string( mask ) );
- printf( "bin : %s\n", bits2string( bin ) );
- printf( "bin & mask: %s\n", bits2string( bin & mask ) );
-*/
- }
- }
-
- fprintf( stderr, "done.\n" );
- }
-
- fprintf( stderr, "done.\n" );
-
- close_stream( fp );
+ fasta_put_entry( entry );
}
-
-
elsif ( $script eq "oligo_freq" ) { script_oligo_freq( $in, $out, $options ) }
elsif ( $script eq "create_weight_matrix" ) { script_create_weight_matrix( $in, $out, $options ) }
elsif ( $script eq "calc_bit_scores" ) { script_calc_bit_scores( $in, $out, $options ) }
+ elsif ( $script eq "calc_fixedstep" ) { script_calc_fixedstep( $in, $out, $options ) }
elsif ( $script eq "reverse_seq" ) { script_reverse_seq( $in, $out, $options ) }
elsif ( $script eq "complement_seq" ) { script_complement_seq( $in, $out, $options ) }
elsif ( $script eq "remove_indels" ) { script_remove_indels( $in, $out, $options ) }
percent|p
);
}
+ elsif ( $script eq "calc_fixedstep" )
+ {
+ @options = qw(
+ );
+ }
elsif ( $script eq "transliterate_seq" )
{
@options = qw(
}
+sub script_calc_fixedstep
+{
+ # Martin A. Hansen, September 2008.
+
+ # Calculates fixedstep entries from data in the stream.
+
+ my ( $in, # handle to in stream
+ $out, # handle to out stream
+ $options, # options hash
+ ) = @_;
+
+ # Returns nothing.
+
+ my ( $record, %fh_hash, $fh_in, $fh_out, $chr, $chr, $beg, $end, $q_id, $block, $entry, $clones, $beg_block, $max, $i );
+
+ while ( $record = get_record( $in ) )
+ {
+ $record->{ "CHR" } = $record->{ "S_ID" } if not defined $record->{ "CHR" };
+ $record->{ "CHR_BEG" } = $record->{ "S_BEG" } if not defined $record->{ "CHR_BEG" };
+ $record->{ "CHR_END" } = $record->{ "S_END" } if not defined $record->{ "CHR_END" };
+
+ if ( $record->{ "CHR" } and defined $record->{ "CHR_BEG" } and $record->{ "CHR_END" } )
+ {
+ $fh_hash{ $record->{ "CHR" } } = Maasha::Common::write_open( "$BP_TMP/$record->{ 'CHR' }" ) if not exists $fh_hash{ $record->{ "CHR" } };
+
+ $fh_out = $fh_hash{ $record->{ "CHR" } };
+
+ Maasha::UCSC::bed_put_entry( $record, $fh_out, 5 );
+ }
+ }
+
+ map { close $_ } keys %fh_hash;
+
+ foreach $chr ( sort keys %fh_hash )
+ {
+ Maasha::Common::run( "bedSort", "$BP_TMP/$chr $BP_TMP/$chr" );
+
+ $fh_in = Maasha::Common::read_open( "$BP_TMP/$chr" );
+
+ undef $block;
+
+ while ( $entry = Maasha::UCSC::bed_get_entry( $fh_in, 5 ) )
+ {
+ $chr = $entry->{ 'CHR' };
+ $beg = $entry->{ 'CHR_BEG' };
+ $end = $entry->{ 'CHR_END' };
+ $q_id = $entry->{ 'Q_ID' };
+
+ if ( $options->{ "score" } ) {
+ $clones = $entry->{ 'SCORE' };
+ } elsif ( $q_id =~ /_(\d+)$/ ) {
+ $clones = $1;
+ } else {
+ $clones = 1;
+ }
+
+ if ( $block )
+ {
+ if ( $beg > $max )
+ {
+ $record->{ "CHR" } = $chr;
+ $record->{ "CHR_BEG" } = $beg_block;
+ $record->{ "STEP" } = 1;
+ $record->{ "VALS" } = join ";", @{ $block };
+
+ put_record( $record, $out );
+
+ undef $block;
+ }
+ else
+ {
+ for ( $i = $beg - $beg_block; $i < ( $beg - $beg_block ) + ( $end - $beg ); $i++ ) {
+ $block->[ $i ] += $clones;
+ }
+
+ $max = Maasha::Calc::max( $max, $end );
+ }
+ }
+
+ if ( not $block )
+ {
+ $beg_block = $beg;
+ $max = $end;
+
+ for ( $i = 0; $i < ( $end - $beg ); $i++ ) {
+ $block->[ $i ] += $clones;
+ }
+ }
+ }
+
+ close $fh_in;
+
+ $record->{ "CHR" } = $chr;
+ $record->{ "CHR_BEG" } = $beg_block;
+ $record->{ "STEP" } = 1;
+ $record->{ "VALS" } = join ";", @{ $block };
+
+ put_record( $record, $out );
+
+ unlink "$BP_TMP/$chr";
+ }
+}
+
+
sub script_reverse_seq
{
# Martin A. Hansen, August 2007.
{
if ( $beg > $max )
{
- Maasha::UCSC::fixedstep_put_entry( $chr, $beg_block, $block, $fh_out );
+ Maasha::UCSC::fixedstep_put_entry( $chr, $beg_block, $block, $fh_out, $options->{ "log10" } );
undef $block;
}
else