1 /* Martin Asser Hansen (mail@maasha.dk) Copyright (C) 2008 - All right reserved */
9 #define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */
10 #define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */
11 #define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */
12 #define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */
14 static uint mask_create( int oligo_size );
15 static void oligo_count( char *path, uint **array_ppt, uint nmer, uint mask );
16 static void oligo_count_output( char *path, uint *array, uint nmer, uint mask, bool log10_flag );
17 static void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size, bool log10_flag );
24 "repeat-O-matic determines the repetiveness of a genome by determining\n"
25 "the number of identical n-mers for each position in the genome.\n"
27 "The output is a fixedStep file ala the phastCons files from the UCSC\n"
30 "Usage: repeat-O-matic [options] <FASTA file>\n"
33 " [-n <int> | --nmer <int>] # nmer size between 1 and 15 (Default 15).\n"
34 " [-1 | --log10] # output log10 (Default no).\n"
37 " repeat-O-matic -n 14 -l hg18.fna > hg18.fixedStep\n"
39 "Copyright (C) 2008, Martin A. Hansen\n"
47 int main( int argc, char *argv[] )
51 bool log10_flag = FALSE;
56 static struct option longopts[] = {
57 { "nmer", required_argument, NULL, 'n' },
58 { "log10", no_argument, NULL, 'l' },
62 while ( ( opt = getopt_long( argc, argv, "n:l", longopts, NULL ) ) != -1 )
65 case 'n': nmer = strtol( optarg, NULL, 0 ); break;
66 case 'l': log10_flag = TRUE; break;
74 if ( nmer < 1 || nmer > 15 )
76 fprintf( stderr, "ERROR: nmer must be between 1 and 15 inclusive - not %d\n", nmer );
84 path = argv[ argc - 1 ];
86 mask = mask_create( nmer );
88 oligo_count( path, &array, nmer, mask );
90 oligo_count_output( path, array, nmer, mask, log10_flag );
96 uint mask_create( int oligo_size )
98 /* Martin A. Hansen, June 2008 */
100 /* Create a bit mask for binary encoded oligos less than 32 bits. */
105 for ( i = 0; i < oligo_size; i++ )
109 mask |= 3; /* add 11 to mask */
116 void oligo_count( char *path, uint **array_ppt, uint nmer, uint mask )
118 /* Martin A. Hansen, June 2008 */
120 /* Count the occurence of all oligos of a fixed size in a FASTA file. */
122 uint *array = *array_ppt;
127 seq_entry *entry = NULL;
130 array_size = ( 1 << ( nmer * 2 ) );
131 array = mem_get_zero( sizeof( uint ) * array_size );
133 fp = read_open( path );
135 entry = seq_new( MAX_SEQ_NAME, MAX_SEQ );
137 while ( ( fasta_get_entry( fp, &entry ) ) != 0 )
139 fprintf( stderr, "Counting oligos in: %s ... ", entry->seq_name );
141 /* ---- Sense strand ---- */
146 for ( i = 0; entry->seq[ i ]; i++ )
150 switch( entry->seq[ i ] )
152 case 'A': case 'a': add_A( bin ); j++; break;
153 case 'T': case 't': add_T( bin ); j++; break;
154 case 'C': case 'c': add_C( bin ); j++; break;
155 case 'G': case 'g': add_G( bin ); j++; break;
156 default: bin = 0; j = 0; break;
161 array[ ( bin & mask ) ]++;
164 printf( "mask : %s\n", bits2string( mask ) );
165 printf( "bin : %s\n", bits2string( bin ) );
166 printf( "bin & mask: %s\n", bits2string( bin & mask ) );
171 /* ---- Anti-sense strand ---- */
173 revcomp_dna( entry->seq );
178 for ( i = 0; entry->seq[ i ]; i++ )
182 switch( entry->seq[ i ] )
184 case 'A': case 'a': add_A( bin ); j++; break;
185 case 'T': case 't': add_T( bin ); j++; break;
186 case 'C': case 'c': add_C( bin ); j++; break;
187 case 'G': case 'g': add_G( bin ); j++; break;
188 default: bin = 0; j = 0; break;
193 array[ ( bin & mask ) ]++;
196 printf( "mask : %s\n", bits2string( mask ) );
197 printf( "bin : %s\n", bits2string( bin ) );
198 printf( "bin & mask: %s\n", bits2string( bin & mask ) );
203 fprintf( stderr, "done.\n" );
208 free( entry->seq_name );
216 void oligo_count_output( char *path, uint *array, uint nmer, uint mask, bool log10_flag )
218 /* Martin A. Hansen, June 2008 */
220 /* Output oligo count for each sequence position. */
234 entry = seq_new( MAX_SEQ_NAME, MAX_SEQ );
236 fp = read_open( path );
238 while ( ( fasta_get_entry( fp, &entry ) ) != 0 )
240 fprintf( stderr, "Writing results for: %s ... ", entry->seq_name );
245 block_size = sizeof( uint ) * ( entry->seq_len + nmer );
246 block = mem_get_zero( block_size );
248 for ( i = 0; entry->seq[ i ]; i++ )
252 switch( entry->seq[ i ] )
254 case 'A': case 'a': add_A( bin ); j++; break;
255 case 'T': case 't': add_T( bin ); j++; break;
256 case 'C': case 'c': add_C( bin ); j++; break;
257 case 'G': case 'g': add_G( bin ); j++; break;
258 default: bin = 0; j = 0; break;
263 count = array[ ( bin & mask ) ];
267 chr_pos = i - nmer + 1;
269 if ( block_pos == 0 )
273 block[ block_pos ] = count;
279 if ( chr_pos > block_beg + block_pos )
281 fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos, log10_flag );
285 memset( block, '\0', block_pos );
289 block[ block_pos ] = count;
300 fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos, log10_flag );
306 fprintf( stderr, "done.\n" );
309 free( entry->seq_name );
317 void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size, bool log10_flag )
319 /* Martin A. Hansen, June 2008 */
321 /* Outputs a block of fixedStep values. */
327 if ( block_size > 0 )
329 beg += 1; /* fixedStep format is 1 based. */
331 printf( "fixedStep chrom=%s start=%d step=%d\n", chr, beg, step_size );
333 for ( i = 0; i < block_size; i++ ) {
334 printf( "%lf\n", log10( block_array[ i ] ) );
340 if ( block_size > 0 )
342 beg += 1; /* fixedStep format is 1 based. */
344 printf( "fixedStep chrom=%s start=%d step=%d\n", chr, beg, step_size );
346 for ( i = 0; i < block_size; i++ ) {
347 printf( "%i\n", block_array[ i ] );