2 Copyright (C) 2008, Martin A. Hansen
4 This program determines the repetiveness of a genome by determining
5 the number of identical 15-mers for each position in the genome.
7 The output is a fixedStep file ala the phastCons files from the UCSC
10 It is very fast and efficient using less than 8 Gb of memory to
11 complete the human genome in roughly 30 minutes.
19 // #define OLIGO_SIZE 15
21 #define SIZE ( 1 << ( OLIGO_SIZE * 2 ) )
24 #define T 3 /* 11 on the rightmost two bits of bin. */
25 #define C 1 /* 01 on the rightmost two bits of bin. */
26 #define G 2 /* 10 on the rightmost two bits of bin. */
28 uint *oligo_count( char *path );
29 uint mask_create( int oligo_size );
30 void oligo_count_output( char *path, uint *array );
31 void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size );
33 int main( int argc, char *argv[] )
40 array = oligo_count( path );
42 //oligo_count_output( path, array );
48 uint *oligo_count( char *path )
50 /* Martin A. Hansen, June 2008 */
52 /* Count the occurence of all oligos of a fixed size in a FASTA file. */
61 uint A_rc = ( 3 << ( UINT_BITS - 2 ) ); /* 11 on the leftmost two bits an uint. */
62 uint G_rc = ( 2 << ( UINT_BITS - 2 ) ); /* 10 on the leftmost two bits an uint. */
63 uint C_rc = ( 1 << ( UINT_BITS - 2 ) ); /* 01 on the leftmost two bits an uint. */
64 seq_entry *entry = NULL;
66 file_buffer *buffer = NULL;
68 array = mem_get_zero( sizeof( uint ) * SIZE );
70 mask = mask_create( OLIGO_SIZE );
72 entry = mem_get( sizeof( entry ) );
74 fp = read_open( path );
76 while ( ( fasta_get_entry( buffer, &entry ) ) )
78 fprintf( stderr, "Counting oligos in: %s ... ", entry->seq_name );
84 for ( i = 0; entry->seq[ i ]; i++ )
89 switch( entry->seq[ i ] )
91 case 'A': case 'a': bin_rc1 |= A_rc; j++; break;
92 case 'T': case 't': bin |= T; j++; break;
93 case 'C': case 'c': bin |= C; bin_rc1 |= G_rc; j++; break;
94 case 'G': case 'g': bin |= G; bin_rc1 |= C_rc; j++; break;
95 default: bin = 0; bin_rc1 = 0; j = 0; break;
98 if ( j >= OLIGO_SIZE )
100 array[ ( bin & mask ) ]++;
104 bin_rc2 >>= ( UINT_BITS - ( OLIGO_SIZE * 2 ) );
106 array[ ( bin_rc2 ) ]++;
109 printf( "mask : %s\n", bits2string( mask ) );
110 printf( "bin : %s\n", bits2string( bin ) );
111 printf( "bin & mask: %s\n", bits2string( bin & mask ) );
112 printf( "bin_rc1 : %s\n", bits2string( bin_rc1 ) );
113 printf( "bin_rc2 : %s\n", bits2string( bin_rc2 ) );
118 fprintf( stderr, "done.\n" );
123 fasta_free_entry( entry );
129 uint mask_create( int oligo_size )
131 /* Martin A. Hansen, June 2008 */
133 /* Create a bit mask for binary encode oligos less than sizeof( uint ). */
140 for ( i = 0; i < oligo_size; i++ )
151 //void oligo_count_output( char *path, uint *array )
153 // /* Martin A. Hansen, June 2008 */
155 // /* Output oligo count for each sequence position. */
157 // struct seq_entry *entry;
169 // file_buffer *buffer;
171 // mask = mask_create( OLIGO_SIZE );
173 // entry = mem_get( sizeof( entry ) );
175 // fp = read_open( path );
177 // while ( ( fasta_get_entry( buffer, &entry ) ) )
179 // fprintf( stderr, "Writing results for: %s ... ", entry->seq_name );
184 // block_size = sizeof( uint ) * ( entry->seq_len + OLIGO_SIZE );
185 // block = mem_get_zero( block_size );
187 // for ( i = 0; entry->seq[ i ]; i++ )
191 // switch( entry->seq[ i ] )
193 // case 'A': case 'a': j++; break;
194 // case 'T': case 't': bin |= T; j++; break;
195 // case 'C': case 'c': bin |= C; j++; break;
196 // case 'G': case 'g': bin |= G; j++; break;
197 // default: bin = 0; j = 0; break;
200 // if ( j >= OLIGO_SIZE )
202 // count = array[ ( bin & mask ) ];
206 // chr_pos = i - OLIGO_SIZE + 1;
208 // if ( block_pos == 0 )
210 // memset( block, '\0', block_size );
212 // block_beg = chr_pos;
214 // block[ block_pos ] = count;
220 // if ( chr_pos > block_beg + block_pos )
222 // fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos );
228 // block[ block_pos ] = count;
237 // if ( block_pos > 0 )
239 // fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos );
241 // mem_free( ( void * ) &block );
244 // fprintf( stderr, "done.\n" );
247 // close_stream( fp );
249 // fasta_free_entry( entry );
253 void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size )
255 /* Martin A. Hansen, June 2008 */
257 /* Outputs a block of fixedStep values. */
261 if ( block_size > 0 )
263 beg += 1; /* fixedStep format is 1 based. */
265 printf( "fixedStep chrom=%s start=%d step=%d\n", chr, beg, step_size );
267 for ( i = 0; i < block_size; i++ ) {
268 printf( "%u\n", block_array[ i ] );