2 Copyright (C) 2008, Martin A. Hansen
4 This program determines the repetiveness of a genome by determining
5 the number of identical 15-mers for each position in the genome.
7 The output is a fixedStep file ala the phastCons files from the UCSC
10 It is very fast and efficient using less than 8 Gb of memory to
11 complete the human genome in roughly 30 minutes.
18 // #define OLIGO_SIZE 15
20 #define SIZE ( 1 << ( OLIGO_SIZE * 2 ) )
23 #define T 3 /* 11 on the rightmost two bits of bin. */
24 #define C 1 /* 01 on the rightmost two bits of bin. */
25 #define G 2 /* 10 on the rightmost two bits of bin. */
27 uint *oligo_count( char *path );
28 uint mask_create( int oligo_size );
29 void oligo_count_output( char *path, uint *array );
30 void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size );
32 int main( int argc, char *argv[] )
39 array = oligo_count( path );
41 oligo_count_output( path, array );
47 uint *oligo_count( char *path )
49 /* Martin A. Hansen, June 2008 */
51 /* Count the occurence of all oligos of a fixed size in a FASTA file. */
60 uint A_rc = ( 3 << ( UINT_BITS - 2 ) ); /* 11 on the leftmost two bits an uint. */
61 uint G_rc = ( 2 << ( UINT_BITS - 2 ) ); /* 10 on the leftmost two bits an uint. */
62 uint C_rc = ( 1 << ( UINT_BITS - 2 ) ); /* 01 on the leftmost two bits an uint. */
63 struct seq_entry *entry = NULL;
66 array = mem_get_zero( sizeof( uint ) * SIZE );
68 mask = mask_create( OLIGO_SIZE );
72 fp = read_open( path );
74 while ( ( fasta_get_entry( fp, entry ) ) )
76 fprintf( stderr, "Counting oligos in: %s ... ", entry->seq_name );
82 for ( i = 0; entry->seq[ i ]; i++ )
87 switch( entry->seq[ i ] )
89 case 'A': case 'a': bin_rc1 |= A_rc; j++; break;
90 case 'T': case 't': bin |= T; j++; break;
91 case 'C': case 'c': bin |= C; bin_rc1 |= G_rc; j++; break;
92 case 'G': case 'g': bin |= G; bin_rc1 |= C_rc; j++; break;
93 default: bin = 0; bin_rc1 = 0; j = 0; break;
96 if ( j >= OLIGO_SIZE )
98 array[ ( bin & mask ) ]++;
102 bin_rc2 >>= ( UINT_BITS - ( OLIGO_SIZE * 2 ) );
104 array[ ( bin_rc2 ) ]++;
107 printf( "mask : %s\n", bits2string( mask ) );
108 printf( "bin : %s\n", bits2string( bin ) );
109 printf( "bin & mask: %s\n", bits2string( bin & mask ) );
110 printf( "bin_rc1 : %s\n", bits2string( bin_rc1 ) );
111 printf( "bin_rc2 : %s\n", bits2string( bin_rc2 ) );
116 fprintf( stderr, "done.\n" );
121 fasta_free_entry( entry );
127 uint mask_create( int oligo_size )
129 /* Martin A. Hansen, June 2008 */
131 /* Create a bit mask for binary encode oligos less than sizeof( uint ). */
138 for ( i = 0; i < oligo_size; i++ )
149 void oligo_count_output( char *path, uint *array )
151 /* Martin A. Hansen, June 2008 */
153 /* Output oligo count for each sequence position. */
155 struct seq_entry *entry;
167 mask = mask_create( OLIGO_SIZE );
171 fp = read_open( path );
173 while ( ( fasta_get_entry( fp, entry ) ) )
175 fprintf( stderr, "Writing results for: %s ... ", entry->seq_name );
180 block = mem_get_zero( sizeof( uint ) * ( entry->seq_len + OLIGO_SIZE ) );
182 for ( i = 0; entry->seq[ i ]; i++ )
186 switch( entry->seq[ i ] )
188 case 'A': case 'a': j++; break;
189 case 'T': case 't': bin |= T; j++; break;
190 case 'C': case 'c': bin |= C; j++; break;
191 case 'G': case 'g': bin |= G; j++; break;
192 default: bin = 0; j = 0; break;
195 if ( j >= OLIGO_SIZE )
197 count = array[ ( bin & mask ) ];
201 chr_pos = i - OLIGO_SIZE + 1;
203 if ( block_pos == 0 )
209 block[ block_pos ] = count;
215 if ( chr_pos > block_beg + block_pos )
217 fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos );
223 block[ block_pos ] = count;
234 fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos );
239 fprintf( stderr, "done.\n" );
244 fasta_free_entry( entry );
248 void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size )
250 /* Martin A. Hansen, June 2008 */
252 /* Outputs a block of fixedStep values. */
256 if ( block_size > 0 )
258 beg += 1; /* fixedStep format is 1 based. */
260 printf( "fixedStep chrom=%s start=%d step=%d\n", chr, beg, step_size );
262 for ( i = 0; i < block_size; i++ ) {
263 printf( "%u\n", block_array[ i ] );