2 Copyright (C) 2008, Martin A. Hansen
4 This program determines the repetiveness of a genome by determining
5 the number of identical 15-mers for each position in the genome.
7 The output is a fixedStep file ala the phastCons files from the UCSC
10 It is very fast and efficient using less than 8 Gb of memory to
11 complete the human genome in roughly 30 minutes.
20 // #define OLIGO_SIZE 15
22 #define SIZE ( 1 << ( OLIGO_SIZE * 2 ) )
25 #define T 3 /* 11 on the rightmost two bits of bin. */
26 #define C 1 /* 01 on the rightmost two bits of bin. */
27 #define G 2 /* 10 on the rightmost two bits of bin. */
29 uint *oligo_count( char *path );
30 uint mask_create( int oligo_size );
31 void oligo_count_output( char *path, uint *array );
32 void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size );
34 int main( int argc, char *argv[] )
41 array = oligo_count( path );
43 //oligo_count_output( path, array );
49 uint *oligo_count( char *path )
51 /* Martin A. Hansen, June 2008 */
53 /* Count the occurence of all oligos of a fixed size in a FASTA file. */
62 uint A_rc = ( 3 << ( UINT_BITS - 2 ) ); /* 11 on the leftmost two bits an uint. */
63 uint G_rc = ( 2 << ( UINT_BITS - 2 ) ); /* 10 on the leftmost two bits an uint. */
64 uint C_rc = ( 1 << ( UINT_BITS - 2 ) ); /* 01 on the leftmost two bits an uint. */
65 seq_entry *entry = NULL;
68 array = mem_get_zero( sizeof( uint ) * SIZE );
70 mask = mask_create( OLIGO_SIZE );
72 entry = mem_get( sizeof( entry ) );
74 fp = read_open( path );
76 while ( ( fasta_get_entry( fp, &entry ) ) )
78 fprintf( stderr, "Counting oligos in: %s ... ", entry->seq_name );
84 for ( i = 0; entry->seq[ i ]; i++ )
89 switch( entry->seq[ i ] )
91 case 'A': case 'a': bin_rc1 |= A_rc; j++; break;
92 case 'T': case 't': bin |= T; j++; break;
93 case 'C': case 'c': bin |= C; bin_rc1 |= G_rc; j++; break;
94 case 'G': case 'g': bin |= G; bin_rc1 |= C_rc; j++; break;
95 default: bin = 0; bin_rc1 = 0; j = 0; break;
98 if ( j >= OLIGO_SIZE )
100 array[ ( bin & mask ) ]++;
104 bin_rc2 >>= ( UINT_BITS - ( OLIGO_SIZE * 2 ) );
106 array[ ( bin_rc2 ) ]++;
109 printf( "mask : %s\n", bits2string( mask ) );
110 printf( "bin : %s\n", bits2string( bin ) );
111 printf( "bin & mask: %s\n", bits2string( bin & mask ) );
112 printf( "bin_rc1 : %s\n", bits2string( bin_rc1 ) );
113 printf( "bin_rc2 : %s\n", bits2string( bin_rc2 ) );
118 fprintf( stderr, "done.\n" );
123 free( entry->seq_name );
131 uint mask_create( int oligo_size )
133 /* Martin A. Hansen, June 2008 */
135 /* Create a bit mask for binary encode oligos less than sizeof( uint ). */
142 for ( i = 0; i < oligo_size; i++ )
153 //void oligo_count_output( char *path, uint *array )
155 // /* Martin A. Hansen, June 2008 */
157 // /* Output oligo count for each sequence position. */
159 // struct seq_entry *entry;
171 // file_buffer *buffer;
173 // mask = mask_create( OLIGO_SIZE );
175 // entry = mem_get( sizeof( entry ) );
177 // fp = read_open( path );
179 // while ( ( fasta_get_entry( buffer, &entry ) ) )
181 // fprintf( stderr, "Writing results for: %s ... ", entry->seq_name );
186 // block_size = sizeof( uint ) * ( entry->seq_len + OLIGO_SIZE );
187 // block = mem_get_zero( block_size );
189 // for ( i = 0; entry->seq[ i ]; i++ )
193 // switch( entry->seq[ i ] )
195 // case 'A': case 'a': j++; break;
196 // case 'T': case 't': bin |= T; j++; break;
197 // case 'C': case 'c': bin |= C; j++; break;
198 // case 'G': case 'g': bin |= G; j++; break;
199 // default: bin = 0; j = 0; break;
202 // if ( j >= OLIGO_SIZE )
204 // count = array[ ( bin & mask ) ];
208 // chr_pos = i - OLIGO_SIZE + 1;
210 // if ( block_pos == 0 )
212 // memset( block, '\0', block_size );
214 // block_beg = chr_pos;
216 // block[ block_pos ] = count;
222 // if ( chr_pos > block_beg + block_pos )
224 // fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos );
230 // block[ block_pos ] = count;
239 // if ( block_pos > 0 )
241 // fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos );
243 // mem_free( ( void * ) &block );
246 // fprintf( stderr, "done.\n" );
249 // close_stream( fp );
251 // fasta_free_entry( entry );
255 void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size )
257 /* Martin A. Hansen, June 2008 */
259 /* Outputs a block of fixedStep values. */
263 if ( block_size > 0 )
265 beg += 1; /* fixedStep format is 1 based. */
267 printf( "fixedStep chrom=%s start=%d step=%d\n", chr, beg, step_size );
269 for ( i = 0; i < block_size; i++ ) {
270 printf( "%u\n", block_array[ i ] );