-/*
- Copyright (C) 2008, Martin A. Hansen
-
- This program determines the repetiveness of a genome by determining
- the number of identical 15-mers for each position in the genome.
-
- The output is a fixedStep file ala the phastCons files from the UCSC
- Genome browser.
-
- It is very fast and efficient using less than 8 Gb of memory to
- complete the human genome in roughly 30 minutes.
-*/
-
+#include <getopt.h>
#include "common.h"
#include "mem.h"
#include "filesys.h"
#include "seq.h"
#include "fasta.h"
-
-#define OLIGO_SIZE 15
-//#define OLIGO_SIZE 5
-#define SIZE ( 1 << ( OLIGO_SIZE * 2 ) )
-
-#define T 3 /* 11 on the rightmost two bits of bin. */
-#define C 1 /* 01 on the rightmost two bits of bin. */
-#define G 2 /* 10 on the rightmost two bits of bin. */
+#define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */
+#define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */
+#define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */
+#define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */
+
+static uint mask_create( int oligo_size );
+static void oligo_count( char *path, uint **array_ppt, uint nmer, uint mask );
+static void oligo_count_output( char *path, uint *array, uint nmer, uint mask, bool log10_flag );
+static void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size, bool log10_flag );
-uint mask_create( int oligo_size );
-void oligo_count( char *path, uint **array_ppt, uint mask );
-void oligo_count_output( char *path, uint *array, uint mask );
-void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size );
+static void usage()
+{
+ fprintf( stderr,
+ "\n"
+ "repeat-O-matic determines the repetiveness of a genome by determining\n"
+ "the number of identical n-mers for each position in the genome.\n"
+ "\n"
+ "The output is a fixedStep file ala the phastCons files from the UCSC\n"
+ "Genome browser.\n"
+ "\n"
+ "Usage: repeat-O-matic [options] <FASTA file>\n"
+ "\n"
+ "Options:\n"
+ " [-n <int> | --nmer <int>] # nmer size between 1 and 15 (Default 15).\n"
+ " [-1 | --log10] # output log10 (Default no).\n"
+ "\n"
+ "Examples:\n"
+ " repeat-O-matic -n 14 -l hg18.fna > hg18.fixedStep\n"
+ "\n"
+ "Copyright (C) 2008, Martin A. Hansen\n"
+ "\n"
+ );
+
+ exit( EXIT_FAILURE );
+}
int main( int argc, char *argv[] )
{
- char *path = NULL;
- uint mask = 0;
- uint *array = NULL;
+ int opt = 0;
+ uint nmer = 15;
+ bool log10_flag = FALSE;
+ char *path = NULL;
+ uint mask = 0;
+ uint *array = NULL;
+
+ static struct option longopts[] = {
+ { "nmer", required_argument, NULL, 'n' },
+ { "log10", no_argument, NULL, 'l' },
+ { NULL, 0, NULL, 0 }
+ };
+
+ while ( ( opt = getopt_long( argc, argv, "n:l", longopts, NULL ) ) != -1 )
+ {
+ switch ( opt ) {
+ case 'n': nmer = strtol( optarg, NULL, 0 ); break;
+ case 'l': log10_flag = TRUE; break;
+ default: break;
+ }
+ }
- path = argv[ 1 ];
+ argc -= optind;
+ argv += optind;
- mask = mask_create( OLIGO_SIZE );
+ if ( nmer < 1 || nmer > 15 )
+ {
+ fprintf( stderr, "ERROR: nmer must be between 1 and 15 inclusive - not %d\n", nmer );
+ abort();
+ }
- oligo_count( path, &array, mask );
+ if ( argc < 1 ) {
+ usage();
+ }
+
+ path = argv[ argc - 1 ];
+
+ mask = mask_create( nmer );
+
+ oligo_count( path, &array, nmer, mask );
- oligo_count_output( path, array, mask );
+ oligo_count_output( path, array, nmer, mask, log10_flag );
- return 0;
+ return EXIT_SUCCESS;
}
}
-void oligo_count( char *path, uint **array_ppt, uint mask )
+void oligo_count( char *path, uint **array_ppt, uint nmer, uint mask )
{
/* Martin A. Hansen, June 2008 */
/* Count the occurence of all oligos of a fixed size in a FASTA file. */
- uint *array = *array_ppt;
- uint i = 0;
- uint j = 0;
- uint bin = 0;
- seq_entry *entry = NULL;
- FILE *fp = NULL;
+ uint *array = *array_ppt;
+ uint array_size = 0;
+ uint i = 0;
+ uint j = 0;
+ uint bin = 0;
+ seq_entry *entry = NULL;
+ FILE *fp = NULL;
- array = mem_get_zero( sizeof( uint ) * SIZE );
+ array_size = ( 1 << ( nmer * 2 ) );
+ array = mem_get_zero( sizeof( uint ) * array_size );
fp = read_open( path );
switch( entry->seq[ i ] )
{
- case 'A': case 'a': j++; break;
- case 'T': case 't': bin |= T; j++; break;
- case 'C': case 'c': bin |= C; j++; break;
- case 'G': case 'g': bin |= G; j++; break;
+ case 'A': case 'a': add_A( bin ); j++; break;
+ case 'T': case 't': add_T( bin ); j++; break;
+ case 'C': case 'c': add_C( bin ); j++; break;
+ case 'G': case 'g': add_G( bin ); j++; break;
default: bin = 0; j = 0; break;
}
- if ( j >= OLIGO_SIZE )
+ if ( j >= nmer )
{
array[ ( bin & mask ) ]++;
/*
switch( entry->seq[ i ] )
{
- case 'A': case 'a': j++; break;
- case 'T': case 't': bin |= T; j++; break;
- case 'C': case 'c': bin |= C; j++; break;
- case 'G': case 'g': bin |= G; j++; break;
+ case 'A': case 'a': add_A( bin ); j++; break;
+ case 'T': case 't': add_T( bin ); j++; break;
+ case 'C': case 'c': add_C( bin ); j++; break;
+ case 'G': case 'g': add_G( bin ); j++; break;
default: bin = 0; j = 0; break;
}
- if ( j >= OLIGO_SIZE )
+ if ( j >= nmer )
{
array[ ( bin & mask ) ]++;
/*
}
-void oligo_count_output( char *path, uint *array, uint mask )
+void oligo_count_output( char *path, uint *array, uint nmer, uint mask, bool log10_flag )
{
/* Martin A. Hansen, June 2008 */
bin = 0;
j = 0;
block_pos = 0;
- block_size = sizeof( uint ) * ( entry->seq_len + OLIGO_SIZE );
+ block_size = sizeof( uint ) * ( entry->seq_len + nmer );
block = mem_get_zero( block_size );
for ( i = 0; entry->seq[ i ]; i++ )
switch( entry->seq[ i ] )
{
- case 'A': case 'a': j++; break;
- case 'T': case 't': bin |= T; j++; break;
- case 'C': case 'c': bin |= C; j++; break;
- case 'G': case 'g': bin |= G; j++; break;
+ case 'A': case 'a': add_A( bin ); j++; break;
+ case 'T': case 't': add_T( bin ); j++; break;
+ case 'C': case 'c': add_C( bin ); j++; break;
+ case 'G': case 'g': add_G( bin ); j++; break;
default: bin = 0; j = 0; break;
}
- if ( j >= OLIGO_SIZE )
+ if ( j >= nmer )
{
count = array[ ( bin & mask ) ];
if ( count > 1 )
{
- chr_pos = i - OLIGO_SIZE + 1;
+ chr_pos = i - nmer + 1;
if ( block_pos == 0 )
{
{
if ( chr_pos > block_beg + block_pos )
{
- fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos );
+ fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos, log10_flag );
block_pos = 0;
}
if ( block_pos > 0 )
{
- fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos );
+ fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos, log10_flag );
mem_free( ( void * ) &block );
}
}
-void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size )
+void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size, bool log10_flag )
{
/* Martin A. Hansen, June 2008 */
int i;
- if ( block_size > 0 )
+ if ( log10_flag )
{
- beg += 1; /* fixedStep format is 1 based. */
+ if ( block_size > 0 )
+ {
+ beg += 1; /* fixedStep format is 1 based. */
- printf( "fixedStep chrom=%s start=%d step=%d\n", chr, beg, step_size );
+ printf( "fixedStep chrom=%s start=%d step=%d\n", chr, beg, step_size );
- for ( i = 0; i < block_size; i++ ) {
- printf( "%u\n", block_array[ i ] );
+ for ( i = 0; i < block_size; i++ ) {
+ printf( "%lf\n", log10( block_array[ i ] ) );
+ }
+ }
+ }
+ else
+ {
+ if ( block_size > 0 )
+ {
+ beg += 1; /* fixedStep format is 1 based. */
+
+ printf( "fixedStep chrom=%s start=%d step=%d\n", chr, beg, step_size );
+
+ for ( i = 0; i < block_size; i++ ) {
+ printf( "%i\n", block_array[ i ] );
+ }
}
}
}