#include "seq.h"
#include "fasta.h"
+
// #define OLIGO_SIZE 15
#define OLIGO_SIZE 5
#define SIZE ( 1 << ( OLIGO_SIZE * 2 ) )
-#define UINT_BITS 32
-#define T 3 /* 11 on the rightmost two bits of bin. */
-#define C 1 /* 01 on the rightmost two bits of bin. */
-#define G 2 /* 10 on the rightmost two bits of bin. */
+#define T 3 /* 11 on the rightmost two bits of bin. */
+#define C 1 /* 01 on the rightmost two bits of bin. */
+#define G 2 /* 10 on the rightmost two bits of bin. */
+
-uint *oligo_count( char *path );
uint mask_create( int oligo_size );
-void oligo_count_output( char *path, uint *array );
+void oligo_count( char *path, uint **array_ppt, uint mask );
+void oligo_count_output( char *path, uint *array, uint mask );
void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size );
+
int main( int argc, char *argv[] )
{
char *path = NULL;
+ uint mask = 0;
uint *array = NULL;
- path = argv[ 1 ];
+ path = argv[ 1 ];
+
+ mask = mask_create( OLIGO_SIZE );
- array = oligo_count( path );
+ oligo_count( path, &array, mask );
- //oligo_count_output( path, array );
+ oligo_count_output( path, array, mask );
return 0;
}
-uint *oligo_count( char *path )
+uint mask_create( int oligo_size )
{
/* Martin A. Hansen, June 2008 */
- /* Count the occurence of all oligos of a fixed size in a FASTA file. */
+ /* Create a bit mask for binary encoded oligos less than 32 bits. */
- uint *array = NULL;
- uint i = 0;
- uint mask = 0;
- uint bin = 0;
- uint bin_rc1 = 0;
- uint bin_rc2 = 0;
- uint j = 0;
- uint A_rc = ( 3 << ( UINT_BITS - 2 ) ); /* 11 on the leftmost two bits an uint. */
- uint G_rc = ( 2 << ( UINT_BITS - 2 ) ); /* 10 on the leftmost two bits an uint. */
- uint C_rc = ( 1 << ( UINT_BITS - 2 ) ); /* 01 on the leftmost two bits an uint. */
- seq_entry *entry = NULL;
- FILE *fp = NULL;
+ uint i = 0;
+ uint mask = 0;
- array = mem_get_zero( sizeof( uint ) * SIZE );
+ for ( i = 0; i < oligo_size; i++ )
+ {
+ mask <<= 2;
+
+ mask |= 3; /* add 11 to mask */
+ }
+
+ return mask;
+}
- mask = mask_create( OLIGO_SIZE );
- entry = mem_get( sizeof( entry ) );
+void oligo_count( char *path, uint **array_ppt, uint mask )
+{
+ /* Martin A. Hansen, June 2008 */
+
+ /* Count the occurence of all oligos of a fixed size in a FASTA file. */
+
+ uint *array = *array_ppt;
+ uint i = 0;
+ uint j = 0;
+ uint bin = 0;
+ seq_entry *entry = NULL;
+ FILE *fp = NULL;
+
+ array = mem_get_zero( sizeof( uint ) * SIZE );
fp = read_open( path );
- while ( ( fasta_get_entry( fp, &entry ) ) )
+ seq_new( &entry, MAX_SEQ_NAME, MAX_SEQ );
+
+ while ( ( fasta_get_entry( fp, &entry ) ) != 0 )
{
fprintf( stderr, "Counting oligos in: %s ... ", entry->seq_name );
- bin = 0;
- bin_rc1 = 0;
- j = 0;
+ bin = 0;
+ j = 0;
for ( i = 0; entry->seq[ i ]; i++ )
{
- bin <<= 2;
- bin_rc1 >>= 2;
+ bin <<= 2;
switch( entry->seq[ i ] )
{
- case 'A': case 'a': bin_rc1 |= A_rc; j++; break;
- case 'T': case 't': bin |= T; j++; break;
- case 'C': case 'c': bin |= C; bin_rc1 |= G_rc; j++; break;
- case 'G': case 'g': bin |= G; bin_rc1 |= C_rc; j++; break;
- default: bin = 0; bin_rc1 = 0; j = 0; break;
+ case 'A': case 'a': j++; break;
+ case 'T': case 't': bin |= T; j++; break;
+ case 'C': case 'c': bin |= C; j++; break;
+ case 'G': case 'g': bin |= G; j++; break;
+ default: bin = 0; j = 0; break;
}
if ( j >= OLIGO_SIZE )
{
array[ ( bin & mask ) ]++;
-
- bin_rc2 = bin_rc1;
-
- bin_rc2 >>= ( UINT_BITS - ( OLIGO_SIZE * 2 ) );
-
- array[ ( bin_rc2 ) ]++;
/*
printf( "\n" );
printf( "mask : %s\n", bits2string( mask ) );
printf( "bin : %s\n", bits2string( bin ) );
printf( "bin & mask: %s\n", bits2string( bin & mask ) );
- printf( "bin_rc1 : %s\n", bits2string( bin_rc1 ) );
- printf( "bin_rc2 : %s\n", bits2string( bin_rc2 ) );
*/
}
}
free( entry->seq );
entry = NULL;
- return array;
+ *array_ppt = array;
}
-uint mask_create( int oligo_size )
+void oligo_count_output( char *path, uint *array, uint mask )
{
/* Martin A. Hansen, June 2008 */
- /* Create a bit mask for binary encode oligos less than sizeof( uint ). */
+ /* Output oligo count for each sequence position. */
- uint i;
- uint mask;
+ uint i;
+ uint j;
+ uint bin;
+ int count;
+ uint *block;
+ uint block_pos;
+ uint block_beg;
+ uint block_size;
+ uint chr_pos;
+ seq_entry *entry;
+ FILE *fp;
- mask = 0;
+ seq_new( &entry, MAX_SEQ_NAME, MAX_SEQ );
- for ( i = 0; i < oligo_size; i++ )
+ fp = read_open( path );
+
+ while ( ( fasta_get_entry( fp, &entry ) ) != 0 )
{
- mask <<= 2;
+ fprintf( stderr, "Writing results for: %s ... ", entry->seq_name );
- mask |= 3;
- }
+ bin = 0;
+ j = 0;
+ block_pos = 0;
+ block_size = sizeof( uint ) * ( entry->seq_len + OLIGO_SIZE );
+ block = mem_get_zero( block_size );
- return mask;
-}
+ for ( i = 0; entry->seq[ i ]; i++ )
+ {
+ bin <<= 2;
+ switch( entry->seq[ i ] )
+ {
+ case 'A': case 'a': j++; break;
+ case 'T': case 't': bin |= T; j++; break;
+ case 'C': case 'c': bin |= C; j++; break;
+ case 'G': case 'g': bin |= G; j++; break;
+ default: bin = 0; j = 0; break;
+ }
-//void oligo_count_output( char *path, uint *array )
-//{
-// /* Martin A. Hansen, June 2008 */
-//
-// /* Output oligo count for each sequence position. */
-//
-// struct seq_entry *entry;
-// FILE *fp;
-// uint mask;
-// uint i;
-// uint j;
-// uint bin;
-// int count;
-// uint *block;
-// uint block_pos;
-// uint block_beg;
-// uint block_size;
-// uint chr_pos;
-// file_buffer *buffer;
-//
-// mask = mask_create( OLIGO_SIZE );
-//
-// entry = mem_get( sizeof( entry ) );
-//
-// fp = read_open( path );
-//
-// while ( ( fasta_get_entry( buffer, &entry ) ) )
-// {
-// fprintf( stderr, "Writing results for: %s ... ", entry->seq_name );
-//
-// bin = 0;
-// j = 0;
-// block_pos = 0;
-// block_size = sizeof( uint ) * ( entry->seq_len + OLIGO_SIZE );
-// block = mem_get_zero( block_size );
-//
-// for ( i = 0; entry->seq[ i ]; i++ )
-// {
-// bin <<= 2;
-//
-// switch( entry->seq[ i ] )
-// {
-// case 'A': case 'a': j++; break;
-// case 'T': case 't': bin |= T; j++; break;
-// case 'C': case 'c': bin |= C; j++; break;
-// case 'G': case 'g': bin |= G; j++; break;
-// default: bin = 0; j = 0; break;
-// }
-//
-// if ( j >= OLIGO_SIZE )
-// {
-// count = array[ ( bin & mask ) ];
-//
-// if ( count > 1 )
-// {
-// chr_pos = i - OLIGO_SIZE + 1;
-//
-// if ( block_pos == 0 )
-// {
-// memset( block, '\0', block_size );
-//
-// block_beg = chr_pos;
-//
-// block[ block_pos ] = count;
-//
-// block_pos++;
-// }
-// else
-// {
-// if ( chr_pos > block_beg + block_pos )
-// {
-// fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos );
-//
-// block_pos = 0;
-// }
-// else
-// {
-// block[ block_pos ] = count;
-//
-// block_pos++;
-// }
-// }
-// }
-// }
-// }
-//
-// if ( block_pos > 0 )
-// {
-// fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos );
-//
-// mem_free( ( void * ) &block );
-// }
-//
-// fprintf( stderr, "done.\n" );
-// }
-//
-// close_stream( fp );
-//
-// fasta_free_entry( entry );
-//}
+ if ( j >= OLIGO_SIZE )
+ {
+ count = array[ ( bin & mask ) ];
+
+ if ( count > 1 )
+ {
+ chr_pos = i - OLIGO_SIZE + 1;
+
+ if ( block_pos == 0 )
+ {
+ memset( block, '\0', block_size );
+
+ block_beg = chr_pos;
+
+ block[ block_pos ] = count;
+
+ block_pos++;
+ }
+ else
+ {
+ if ( chr_pos > block_beg + block_pos )
+ {
+ fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos );
+
+ block_pos = 0;
+ }
+ else
+ {
+ block[ block_pos ] = count;
+
+ block_pos++;
+ }
+ }
+ }
+ }
+ }
+
+ if ( block_pos > 0 )
+ {
+ fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos );
+
+ mem_free( ( void * ) &block );
+ }
+
+ fprintf( stderr, "done.\n" );
+ }
+
+ free( entry->seq_name );
+ free( entry->seq );
+ entry = NULL;
+
+ close_stream( fp );
+}
void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size )
}
}
}
+
+
--- /dev/null
+#include "common.h"
+#include "filesys.h"
+#include "mem.h"
+#include "seq.h"
+#include "fasta.h"
+
+#define BLOCK32 15
+#define ARRAY_SIZE ( 1 << ( BLOCK32 * 2 ) )
+
+#define BLOCK_SIZE 4
+#define MAX_SPACE 256
+#define T 3 /* 11 on the rightmost two bits of bin. */
+#define C 1 /* 01 on the rightmost two bits of bin. */
+#define G 2 /* 10 on the rightmost two bits of bin. */
+
+
+uint mask_create( int size );
+void block_count( char *path, uint **array_ppt, seq_entry **entry_ppt, uint mask );
+
+
+int main( int argc, char *argv[] )
+{
+ uint mask = 0;
+ int i = 0;
+ uint *array = NULL;
+ seq_entry *entry = NULL;
+
+ mask = mask_create( BLOCK_SIZE );
+
+ array = mem_get_zero( sizeof( uint ) * ARRAY_SIZE );
+
+ seq_new( &entry, MAX_SEQ_NAME, MAX_SEQ );
+
+ printf( "mask: %d\n", mask );
+
+ for ( i = 1; i < argc; i++ ) {
+ block_count( argv[ i ], &array, &entry, mask );
+ }
+
+ return EXIT_SUCCESS;
+}
+
+
+uint mask_create( int size )
+{
+ /* Martin A. Hansen, June 2008 */
+
+ /* Create a bit mask. */
+
+ uint i = 0;
+ uint mask = 0;
+
+ for ( i = 0; i < size; i++ )
+ {
+ mask <<= 2;
+
+ mask |= 3; /* add 11 to mask */
+ }
+
+ return mask;
+}
+
+
+void block_count( char *path, uint **array_ppt, seq_entry **entry_ppt, uint mask )
+{
+ /* Martin A. Hansen, August 2008 */
+
+ /* Scan a FASTA file for ... */
+
+ uint *array = *array_ppt;
+ seq_entry *entry = *entry_ppt;
+ FILE *fp = NULL;
+ uint i = 0;
+ uint j = 0;
+ uint bin1 = 0;
+ uint bin2 = 0;
+
+ fp = read_open( path );
+
+ fprintf( stderr, "Processing: %s\n", path );
+
+ while ( ( fasta_get_entry( fp, &entry ) ) != 0 )
+ {
+ fprintf( stderr, " Counting blocks in: %s ... ", entry->seq_name );
+
+ bin1 = 0;
+ j = 0;
+
+ for ( i = 0; entry->seq[ i ]; i++ )
+ {
+ bin <<= 2;
+
+ switch( entry->seq[ i ] )
+ {
+ case 'A': case 'a': j++; break;
+ case 'T': case 't': bin |= T; j++; break;
+ case 'C': case 'c': bin |= C; j++; break;
+ case 'G': case 'g': bin |= G; j++; break;
+ default: bin = 0; j = 0; break;
+ }
+
+ if ( j >= BLOCK32 )
+ {
+ array[ ( bin & mask ) ]++;
+/*
+ printf( "\n" );
+ printf( "mask : %s\n", bits2string( mask ) );
+ printf( "bin : %s\n", bits2string( bin ) );
+ printf( "bin & mask: %s\n", bits2string( bin & mask ) );
+*/
+ }
+ }
+
+ fprintf( stderr, "done.\n" );
+ }
+
+ fprintf( stderr, "done.\n" );
+
+ close_stream( fp );
+}