2 Copyright (C) 2008, Martin A. Hansen
4 This program determines the repetiveness of a genome by determining
5 the number of identical 15-mers for each position in the genome.
7 The output is a fixedStep file ala the phastCons files from the UCSC
10 It is very fast and efficient using less than 8 Gb of memory to
11 complete the human genome in roughly 30 minutes.
19 // #define OLIGO_SIZE 15
21 #define SIZE ( 1 << ( OLIGO_SIZE * 2 ) )
24 #define T 3 /* 11 on the rightmost two bits of bin. */
25 #define C 1 /* 01 on the rightmost two bits of bin. */
26 #define G 2 /* 10 on the rightmost two bits of bin. */
28 uint *oligo_count( char *path );
29 uint mask_create( int oligo_size );
30 void oligo_count_output( char *path, uint *array );
31 void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size );
33 int main( int argc, char *argv[] )
40 array = oligo_count( path );
42 oligo_count_output( path, array );
48 uint *oligo_count( char *path )
50 /* Martin A. Hansen, June 2008 */
52 /* Count the occurence of all oligos of a fixed size in a FASTA file. */
61 uint A_rc = ( 3 << ( UINT_BITS - 2 ) ); /* 11 on the leftmost two bits an uint. */
62 uint G_rc = ( 2 << ( UINT_BITS - 2 ) ); /* 10 on the leftmost two bits an uint. */
63 uint C_rc = ( 1 << ( UINT_BITS - 2 ) ); /* 01 on the leftmost two bits an uint. */
64 struct seq_entry *entry = NULL;
67 array = mem_get_zero( sizeof( uint ) * SIZE );
69 mask = mask_create( OLIGO_SIZE );
73 fp = read_open( path );
75 while ( ( fasta_get_entry( fp, entry ) ) )
77 fprintf( stderr, "Counting oligos in: %s ... ", entry->seq_name );
83 for ( i = 0; entry->seq[ i ]; i++ )
88 switch( entry->seq[ i ] )
90 case 'A': case 'a': bin_rc1 |= A_rc; j++; break;
91 case 'T': case 't': bin |= T; j++; break;
92 case 'C': case 'c': bin |= C; bin_rc1 |= G_rc; j++; break;
93 case 'G': case 'g': bin |= G; bin_rc1 |= C_rc; j++; break;
94 default: bin = 0; bin_rc1 = 0; j = 0; break;
97 if ( j >= OLIGO_SIZE )
99 array[ ( bin & mask ) ]++;
103 bin_rc2 >>= ( UINT_BITS - ( OLIGO_SIZE * 2 ) );
105 array[ ( bin_rc2 ) ]++;
108 printf( "mask : %s\n", bits2string( mask ) );
109 printf( "bin : %s\n", bits2string( bin ) );
110 printf( "bin & mask: %s\n", bits2string( bin & mask ) );
111 printf( "bin_rc1 : %s\n", bits2string( bin_rc1 ) );
112 printf( "bin_rc2 : %s\n", bits2string( bin_rc2 ) );
117 fprintf( stderr, "done.\n" );
122 fasta_free_entry( entry );
128 uint mask_create( int oligo_size )
130 /* Martin A. Hansen, June 2008 */
132 /* Create a bit mask for binary encode oligos less than sizeof( uint ). */
139 for ( i = 0; i < oligo_size; i++ )
150 void oligo_count_output( char *path, uint *array )
152 /* Martin A. Hansen, June 2008 */
154 /* Output oligo count for each sequence position. */
156 struct seq_entry *entry;
168 mask = mask_create( OLIGO_SIZE );
172 fp = read_open( path );
174 while ( ( fasta_get_entry( fp, entry ) ) )
176 fprintf( stderr, "Writing results for: %s ... ", entry->seq_name );
181 block = mem_get_zero( sizeof( uint ) * ( entry->seq_len + OLIGO_SIZE ) );
183 for ( i = 0; entry->seq[ i ]; i++ )
187 switch( entry->seq[ i ] )
189 case 'A': case 'a': j++; break;
190 case 'T': case 't': bin |= T; j++; break;
191 case 'C': case 'c': bin |= C; j++; break;
192 case 'G': case 'g': bin |= G; j++; break;
193 default: bin = 0; j = 0; break;
196 if ( j >= OLIGO_SIZE )
198 count = array[ ( bin & mask ) ];
202 chr_pos = i - OLIGO_SIZE + 1;
204 if ( block_pos == 0 )
210 block[ block_pos ] = count;
216 if ( chr_pos > block_beg + block_pos )
218 fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos );
224 block[ block_pos ] = count;
235 fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos );
240 fprintf( stderr, "done.\n" );
245 fasta_free_entry( entry );
249 void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size )
251 /* Martin A. Hansen, June 2008 */
253 /* Outputs a block of fixedStep values. */
257 if ( block_size > 0 )
259 beg += 1; /* fixedStep format is 1 based. */
261 printf( "fixedStep chrom=%s start=%d step=%d\n", chr, beg, step_size );
263 for ( i = 0; i < block_size; i++ ) {
264 printf( "%u\n", block_array[ i ] );