From: martinahansen Date: Mon, 25 Aug 2008 07:59:34 +0000 (+0000) Subject: clean-write of repeat-O-matic X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=4daef7eb366ff2842da5b0cec91467d9349d7b76;p=biopieces.git clean-write of repeat-O-matic git-svn-id: http://biopieces.googlecode.com/svn/trunk@222 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_c/Maasha/src/gmon.out b/code_c/Maasha/src/gmon.out index 69f3a07..41152f8 100644 Binary files a/code_c/Maasha/src/gmon.out and b/code_c/Maasha/src/gmon.out differ diff --git a/code_c/Maasha/src/repeat-O-matic.c b/code_c/Maasha/src/repeat-O-matic.c index 9821129..7dc3996 100644 --- a/code_c/Maasha/src/repeat-O-matic.c +++ b/code_c/Maasha/src/repeat-O-matic.c @@ -1,53 +1,94 @@ -/* - Copyright (C) 2008, Martin A. Hansen - - This program determines the repetiveness of a genome by determining - the number of identical 15-mers for each position in the genome. - - The output is a fixedStep file ala the phastCons files from the UCSC - Genome browser. - - It is very fast and efficient using less than 8 Gb of memory to - complete the human genome in roughly 30 minutes. -*/ - +#include #include "common.h" #include "mem.h" #include "filesys.h" #include "seq.h" #include "fasta.h" - -#define OLIGO_SIZE 15 -//#define OLIGO_SIZE 5 -#define SIZE ( 1 << ( OLIGO_SIZE * 2 ) ) - -#define T 3 /* 11 on the rightmost two bits of bin. */ -#define C 1 /* 01 on the rightmost two bits of bin. */ -#define G 2 /* 10 on the rightmost two bits of bin. */ +#define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */ +#define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */ +#define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */ +#define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */ + +static uint mask_create( int oligo_size ); +static void oligo_count( char *path, uint **array_ppt, uint nmer, uint mask ); +static void oligo_count_output( char *path, uint *array, uint nmer, uint mask, bool log10_flag ); +static void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size, bool log10_flag ); -uint mask_create( int oligo_size ); -void oligo_count( char *path, uint **array_ppt, uint mask ); -void oligo_count_output( char *path, uint *array, uint mask ); -void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size ); +static void usage() +{ + fprintf( stderr, + "\n" + "repeat-O-matic determines the repetiveness of a genome by determining\n" + "the number of identical n-mers for each position in the genome.\n" + "\n" + "The output is a fixedStep file ala the phastCons files from the UCSC\n" + "Genome browser.\n" + "\n" + "Usage: repeat-O-matic [options] \n" + "\n" + "Options:\n" + " [-n | --nmer ] # nmer size between 1 and 15 (Default 15).\n" + " [-1 | --log10] # output log10 (Default no).\n" + "\n" + "Examples:\n" + " repeat-O-matic -n 14 -l hg18.fna > hg18.fixedStep\n" + "\n" + "Copyright (C) 2008, Martin A. Hansen\n" + "\n" + ); + + exit( EXIT_FAILURE ); +} int main( int argc, char *argv[] ) { - char *path = NULL; - uint mask = 0; - uint *array = NULL; + int opt = 0; + uint nmer = 15; + bool log10_flag = FALSE; + char *path = NULL; + uint mask = 0; + uint *array = NULL; + + static struct option longopts[] = { + { "nmer", required_argument, NULL, 'n' }, + { "log10", no_argument, NULL, 'l' }, + { NULL, 0, NULL, 0 } + }; + + while ( ( opt = getopt_long( argc, argv, "n:l", longopts, NULL ) ) != -1 ) + { + switch ( opt ) { + case 'n': nmer = strtol( optarg, NULL, 0 ); break; + case 'l': log10_flag = TRUE; break; + default: break; + } + } - path = argv[ 1 ]; + argc -= optind; + argv += optind; - mask = mask_create( OLIGO_SIZE ); + if ( nmer < 1 || nmer > 15 ) + { + fprintf( stderr, "ERROR: nmer must be between 1 and 15 inclusive - not %d\n", nmer ); + abort(); + } - oligo_count( path, &array, mask ); + if ( argc < 1 ) { + usage(); + } + + path = argv[ argc - 1 ]; + + mask = mask_create( nmer ); + + oligo_count( path, &array, nmer, mask ); - oligo_count_output( path, array, mask ); + oligo_count_output( path, array, nmer, mask, log10_flag ); - return 0; + return EXIT_SUCCESS; } @@ -71,20 +112,22 @@ uint mask_create( int oligo_size ) } -void oligo_count( char *path, uint **array_ppt, uint mask ) +void oligo_count( char *path, uint **array_ppt, uint nmer, uint mask ) { /* Martin A. Hansen, June 2008 */ /* Count the occurence of all oligos of a fixed size in a FASTA file. */ - uint *array = *array_ppt; - uint i = 0; - uint j = 0; - uint bin = 0; - seq_entry *entry = NULL; - FILE *fp = NULL; + uint *array = *array_ppt; + uint array_size = 0; + uint i = 0; + uint j = 0; + uint bin = 0; + seq_entry *entry = NULL; + FILE *fp = NULL; - array = mem_get_zero( sizeof( uint ) * SIZE ); + array_size = ( 1 << ( nmer * 2 ) ); + array = mem_get_zero( sizeof( uint ) * array_size ); fp = read_open( path ); @@ -105,14 +148,14 @@ void oligo_count( char *path, uint **array_ppt, uint mask ) switch( entry->seq[ i ] ) { - case 'A': case 'a': j++; break; - case 'T': case 't': bin |= T; j++; break; - case 'C': case 'c': bin |= C; j++; break; - case 'G': case 'g': bin |= G; j++; break; + case 'A': case 'a': add_A( bin ); j++; break; + case 'T': case 't': add_T( bin ); j++; break; + case 'C': case 'c': add_C( bin ); j++; break; + case 'G': case 'g': add_G( bin ); j++; break; default: bin = 0; j = 0; break; } - if ( j >= OLIGO_SIZE ) + if ( j >= nmer ) { array[ ( bin & mask ) ]++; /* @@ -137,14 +180,14 @@ void oligo_count( char *path, uint **array_ppt, uint mask ) switch( entry->seq[ i ] ) { - case 'A': case 'a': j++; break; - case 'T': case 't': bin |= T; j++; break; - case 'C': case 'c': bin |= C; j++; break; - case 'G': case 'g': bin |= G; j++; break; + case 'A': case 'a': add_A( bin ); j++; break; + case 'T': case 't': add_T( bin ); j++; break; + case 'C': case 'c': add_C( bin ); j++; break; + case 'G': case 'g': add_G( bin ); j++; break; default: bin = 0; j = 0; break; } - if ( j >= OLIGO_SIZE ) + if ( j >= nmer ) { array[ ( bin & mask ) ]++; /* @@ -169,7 +212,7 @@ void oligo_count( char *path, uint **array_ppt, uint mask ) } -void oligo_count_output( char *path, uint *array, uint mask ) +void oligo_count_output( char *path, uint *array, uint nmer, uint mask, bool log10_flag ) { /* Martin A. Hansen, June 2008 */ @@ -198,7 +241,7 @@ void oligo_count_output( char *path, uint *array, uint mask ) bin = 0; j = 0; block_pos = 0; - block_size = sizeof( uint ) * ( entry->seq_len + OLIGO_SIZE ); + block_size = sizeof( uint ) * ( entry->seq_len + nmer ); block = mem_get_zero( block_size ); for ( i = 0; entry->seq[ i ]; i++ ) @@ -207,20 +250,20 @@ void oligo_count_output( char *path, uint *array, uint mask ) switch( entry->seq[ i ] ) { - case 'A': case 'a': j++; break; - case 'T': case 't': bin |= T; j++; break; - case 'C': case 'c': bin |= C; j++; break; - case 'G': case 'g': bin |= G; j++; break; + case 'A': case 'a': add_A( bin ); j++; break; + case 'T': case 't': add_T( bin ); j++; break; + case 'C': case 'c': add_C( bin ); j++; break; + case 'G': case 'g': add_G( bin ); j++; break; default: bin = 0; j = 0; break; } - if ( j >= OLIGO_SIZE ) + if ( j >= nmer ) { count = array[ ( bin & mask ) ]; if ( count > 1 ) { - chr_pos = i - OLIGO_SIZE + 1; + chr_pos = i - nmer + 1; if ( block_pos == 0 ) { @@ -236,7 +279,7 @@ void oligo_count_output( char *path, uint *array, uint mask ) { if ( chr_pos > block_beg + block_pos ) { - fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos ); + fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos, log10_flag ); block_pos = 0; } @@ -253,7 +296,7 @@ void oligo_count_output( char *path, uint *array, uint mask ) if ( block_pos > 0 ) { - fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos ); + fixedstep_put_entry( entry->seq_name, block_beg, 1, block, block_pos, log10_flag ); mem_free( ( void * ) &block ); } @@ -269,7 +312,7 @@ void oligo_count_output( char *path, uint *array, uint mask ) } -void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size ) +void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int block_size, bool log10_flag ) { /* Martin A. Hansen, June 2008 */ @@ -277,14 +320,30 @@ void fixedstep_put_entry( char *chr, int beg, int step_size, uint *block_array, int i; - if ( block_size > 0 ) + if ( log10_flag ) { - beg += 1; /* fixedStep format is 1 based. */ + if ( block_size > 0 ) + { + beg += 1; /* fixedStep format is 1 based. */ - printf( "fixedStep chrom=%s start=%d step=%d\n", chr, beg, step_size ); + printf( "fixedStep chrom=%s start=%d step=%d\n", chr, beg, step_size ); - for ( i = 0; i < block_size; i++ ) { - printf( "%u\n", block_array[ i ] ); + for ( i = 0; i < block_size; i++ ) { + printf( "%lf\n", log10( block_array[ i ] ) ); + } + } + } + else + { + if ( block_size > 0 ) + { + beg += 1; /* fixedStep format is 1 based. */ + + printf( "fixedStep chrom=%s start=%d step=%d\n", chr, beg, step_size ); + + for ( i = 0; i < block_size; i++ ) { + printf( "%i\n", block_array[ i ] ); + } } } } diff --git a/code_c/Maasha/src/test.c b/code_c/Maasha/src/test.c index 6a89d40..431d7da 100644 --- a/code_c/Maasha/src/test.c +++ b/code_c/Maasha/src/test.c @@ -24,7 +24,7 @@ int main( int argc, char *argv[] ) } -double avarage( int num, ... ) +double average( int num, ... ) { /* Martin A. Hansen, July 2008 */ diff --git a/code_c/Maasha/src/tetra_count.c b/code_c/Maasha/src/tetra_count.c index 60de794..c47b08b 100644 --- a/code_c/Maasha/src/tetra_count.c +++ b/code_c/Maasha/src/tetra_count.c @@ -73,7 +73,6 @@ void block_count( char *path, uint **array_ppt, seq_entry **entry_ppt, uint mask uint i = 0; uint j = 0; uint bin1 = 0; - uint bin2 = 0; fp = read_open( path ); @@ -88,20 +87,20 @@ void block_count( char *path, uint **array_ppt, seq_entry **entry_ppt, uint mask for ( i = 0; entry->seq[ i ]; i++ ) { - bin <<= 2; + bin1 <<= 2; switch( entry->seq[ i ] ) { case 'A': case 'a': j++; break; - case 'T': case 't': bin |= T; j++; break; - case 'C': case 'c': bin |= C; j++; break; - case 'G': case 'g': bin |= G; j++; break; - default: bin = 0; j = 0; break; + case 'T': case 't': bin1 |= T; j++; break; + case 'C': case 'c': bin1 |= C; j++; break; + case 'G': case 'g': bin1 |= G; j++; break; + default: bin1 = 0; j = 0; break; } if ( j >= BLOCK32 ) { - array[ ( bin & mask ) ]++; + array[ ( bin1 & mask ) ]++; /* printf( "\n" ); printf( "mask : %s\n", bits2string( mask ) );