From 9c630c5acca3afc62a16626453d57ca4dde8eb5e Mon Sep 17 00:00:00 2001 From: martinahansen Date: Wed, 24 Sep 2008 04:34:12 +0000 Subject: [PATCH] adding rescan to bipartite_scan git-svn-id: http://biopieces.googlecode.com/svn/trunk@267 74ccb610-7750-0410-82ae-013aeee3265d --- code_c/Maasha/src/bipartite_decode.c | 38 +----- code_c/Maasha/src/bipartite_scan.c | 189 +++++++++++++++++++++++++-- code_c/Maasha/src/inc/seq.h | 17 ++- code_c/Maasha/src/lib/seq.c | 38 ++++++ code_perl/Maasha/Calc.pm | 33 +++++ 5 files changed, 265 insertions(+), 50 deletions(-) diff --git a/code_c/Maasha/src/bipartite_decode.c b/code_c/Maasha/src/bipartite_decode.c index 0bc706d..a15474f 100644 --- a/code_c/Maasha/src/bipartite_decode.c +++ b/code_c/Maasha/src/bipartite_decode.c @@ -2,48 +2,12 @@ #include "common.h" #include "filesys.h" +#include "seq.h" #define BITS_IN_BYTE 8 /* number of bits in one byte. */ #define BLOCK_SPACE_MAX 64 /* maximum space between two blocks. */ #define BLOCK_MASK ( ( BLOCK_SPACE_MAX << 1 ) - 1 ) /* mask for printing block space. */ -/* Byte array for fast convertion of binary blocks back to DNA. */ -char *bin2dna[256] = { - "AAAA", "AAAC", "AAAG", "AAAT", "AACA", "AACC", "AACG", "AACT", - "AAGA", "AAGC", "AAGG", "AAGT", "AATA", "AATC", "AATG", "AATT", - "ACAA", "ACAC", "ACAG", "ACAT", "ACCA", "ACCC", "ACCG", "ACCT", - "ACGA", "ACGC", "ACGG", "ACGT", "ACTA", "ACTC", "ACTG", "ACTT", - "AGAA", "AGAC", "AGAG", "AGAT", "AGCA", "AGCC", "AGCG", "AGCT", - "AGGA", "AGGC", "AGGG", "AGGT", "AGTA", "AGTC", "AGTG", "AGTT", - "ATAA", "ATAC", "ATAG", "ATAT", "ATCA", "ATCC", "ATCG", "ATCT", - "ATGA", "ATGC", "ATGG", "ATGT", "ATTA", "ATTC", "ATTG", "ATTT", - "CAAA", "CAAC", "CAAG", "CAAT", "CACA", "CACC", "CACG", "CACT", - "CAGA", "CAGC", "CAGG", "CAGT", "CATA", "CATC", "CATG", "CATT", - "CCAA", "CCAC", "CCAG", "CCAT", "CCCA", "CCCC", "CCCG", "CCCT", - "CCGA", "CCGC", "CCGG", "CCGT", "CCTA", "CCTC", "CCTG", "CCTT", - "CGAA", "CGAC", "CGAG", "CGAT", "CGCA", "CGCC", "CGCG", "CGCT", - "CGGA", "CGGC", "CGGG", "CGGT", "CGTA", "CGTC", "CGTG", "CGTT", - "CTAA", "CTAC", "CTAG", "CTAT", "CTCA", "CTCC", "CTCG", "CTCT", - "CTGA", "CTGC", "CTGG", "CTGT", "CTTA", "CTTC", "CTTG", "CTTT", - "GAAA", "GAAC", "GAAG", "GAAT", "GACA", "GACC", "GACG", "GACT", - "GAGA", "GAGC", "GAGG", "GAGT", "GATA", "GATC", "GATG", "GATT", - "GCAA", "GCAC", "GCAG", "GCAT", "GCCA", "GCCC", "GCCG", "GCCT", - "GCGA", "GCGC", "GCGG", "GCGT", "GCTA", "GCTC", "GCTG", "GCTT", - "GGAA", "GGAC", "GGAG", "GGAT", "GGCA", "GGCC", "GGCG", "GGCT", - "GGGA", "GGGC", "GGGG", "GGGT", "GGTA", "GGTC", "GGTG", "GGTT", - "GTAA", "GTAC", "GTAG", "GTAT", "GTCA", "GTCC", "GTCG", "GTCT", - "GTGA", "GTGC", "GTGG", "GTGT", "GTTA", "GTTC", "GTTG", "GTTT", - "TAAA", "TAAC", "TAAG", "TAAT", "TACA", "TACC", "TACG", "TACT", - "TAGA", "TAGC", "TAGG", "TAGT", "TATA", "TATC", "TATG", "TATT", - "TCAA", "TCAC", "TCAG", "TCAT", "TCCA", "TCCC", "TCCG", "TCCT", - "TCGA", "TCGC", "TCGG", "TCGT", "TCTA", "TCTC", "TCTG", "TCTT", - "TGAA", "TGAC", "TGAG", "TGAT", "TGCA", "TGCC", "TGCG", "TGCT", - "TGGA", "TGGC", "TGGG", "TGGT", "TGTA", "TGTC", "TGTG", "TGTT", - "TTAA", "TTAC", "TTAG", "TTAT", "TTCA", "TTCC", "TTCG", "TTCT", - "TTGA", "TTGC", "TTGG", "TTGT", "TTTA", "TTTC", "TTTG", "TTTT" -}; - - /* Function declarations. */ void run_decode( int argc, char *argv[] ); void print_usage(); diff --git a/code_c/Maasha/src/bipartite_scan.c b/code_c/Maasha/src/bipartite_scan.c index bb78a91..ee01f07 100644 --- a/code_c/Maasha/src/bipartite_scan.c +++ b/code_c/Maasha/src/bipartite_scan.c @@ -13,13 +13,8 @@ #define BLOCK_SPACE_MAX 64 /* maximum space between two blocks. */ #define BLOCK_MASK ( ( BLOCK_SPACE_MAX << 1 ) - 1 ) /* mask for printing block space. */ #define COUNT_ARRAY_NMEMB ( 1 << 30 ) /* number of objects in the unsigned int count array. */ -#define CUTOFF 10000 /* minimum number of motifs in output. */ +#define CUTOFF 1 /* minimum number of motifs in output. */ -#define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */ -#define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */ -#define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */ -#define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */ - /* Structure that will hold one tetra nucleotide block. */ struct _bitblock { @@ -33,9 +28,12 @@ typedef struct _bitblock bitblock; void run_scan( int argc, char *argv[] ); void print_usage(); void scan_file( char *file, seq_entry *entry, uint *count_array ); +void rescan_file( char *file, seq_entry *entry, uint *count_array, size_t cutoff ); uint *count_array_new( size_t nmemb ); void scan_seq( char *seq, size_t seq_len, uint *count_array ); +void rescan_seq( char *seq, size_t seq_len, uint *count_array, size_t cutoff ); void scan_list( list_sl *list, uint *count_array ); +void rescan_list( list_sl *list, uint *count_array, size_t pos, size_t cutoff ); bitblock *bitblock_new(); uint blocks2motif( uchar bin1, uchar bin2, ushort dist ); void count_array_print( uint *count_array, size_t nmemb, size_t cutoff ); @@ -78,9 +76,7 @@ void print_usage() /* Print usage and exit if no files in argument. */ - fprintf( stderr, - "Usage: bipartite_scam > result.csv\n" - ); + fprintf( stderr, "Usage: bipartite_scam > result.csv\n" ); exit( EXIT_SUCCESS ); } @@ -117,6 +113,12 @@ void run_scan( int argc, char *argv[] ) count_array_print( count_array, COUNT_ARRAY_NMEMB, CUTOFF ); fprintf( stderr, "done.\n" ); + file = argv[ 1 ]; + + fprintf( stderr, "Rescanning file: %s\n", file ); + rescan_file( file, entry, count_array, CUTOFF ); + fprintf( stderr, "done.\n" ); + seq_destroy( entry ); mem_free( &count_array ); @@ -143,8 +145,7 @@ void scan_file( char *file, seq_entry *entry, uint *count_array ) { /* Martin A. Hansen, September 2008 */ - /* Scan all FASTA entries of a file in both */ - /* sense and anti-sense directions. */ + /* Scan all FASTA entries of a file. */ FILE *fp = read_open( file ); @@ -161,6 +162,27 @@ void scan_file( char *file, seq_entry *entry, uint *count_array ) } +void rescan_file( char *file, seq_entry *entry, uint *count_array, size_t cutoff ) +{ + /* Martin A. Hansen, September 2008 */ + + /* Rescan all FASTA entries of a file. */ + + FILE *fp = read_open( file ); + + while ( fasta_get_entry( fp, &entry ) == TRUE ) + { + fprintf( stderr, " Rescanning: %s (%zu nt) ... ", entry->seq_name, entry->seq_len ); + + rescan_seq( entry->seq, entry->seq_len, count_array, cutoff ); + + fprintf( stderr, "done.\n" ); + } + + close_stream( fp ); +} + + void scan_seq( char *seq, size_t seq_len, uint *count_array ) { /* Martin A. Hansen, September 2008 */ @@ -245,6 +267,90 @@ void scan_seq( char *seq, size_t seq_len, uint *count_array ) } +void rescan_seq( char *seq, size_t seq_len, uint *count_array, size_t cutoff ) +{ + /* Martin A. Hansen, September 2008 */ + + /* Run a sliding window over a given sequence. The window */ + /* consists of a list where new blocks of 4 nucleotides */ + /* are pushed onto one end while at the same time old */ + /* blocks are popped from the other end. The number of */ + /* in the list is determined by the maximum seperator. */ + /* Everytime we have a full window, the window is scanned */ + /* for motifs. */ + + bitblock *block = NULL; + size_t b_count = 0; + ushort n_count = 0; + size_t i = 0; + uchar bin = 0; + bool first_node = TRUE; + node_sl *new_node = NULL; + node_sl *old_node = NULL; + list_sl *list = list_sl_new(); + + for ( i = 0; seq[ i ]; i++ ) + { + bin <<= BITS_IN_NT; + + switch( seq[ i ] ) + { + case 'A': case 'a': add_A( bin ); break; + case 'T': case 't': add_T( bin ); break; + case 'C': case 'c': add_C( bin ); break; + case 'G': case 'g': add_G( bin ); break; + default: n_count = BLOCK_SIZE_NT; break; + } + + if ( i > BLOCK_SIZE_NT - 2 ) + { + b_count++; + + block = bitblock_new(); + block->bin = bin; + + if ( n_count > 0 ) + { + block->hasN = TRUE; + n_count--; + } + + new_node = node_sl_new(); + new_node->val = block; + + if ( first_node ) { + list_sl_add_beg( &list, &new_node ); + } else { + list_sl_add_after( &old_node, &new_node ); + } + + old_node = new_node; + + first_node = FALSE; + + if ( b_count > BLOCK_SPACE_MAX + BLOCK_SIZE_NT ) + { + // bitblock_list_print( list ); /* DEBUG */ + + rescan_list( list, count_array, i, cutoff ); + + list_sl_remove_beg( &list ); + } + } + } + + /* if the list is shorter than BLOCK_SPACE_MAX + BLOCK_SIZE_NT */ + if ( b_count <= BLOCK_SPACE_MAX + BLOCK_SIZE_NT ) + { + // bitblock_list_print( list ); /* DEBUG */ + + rescan_list( list, count_array, i, cutoff ); + } + + list_sl_destroy( &list ); +} + + void scan_list( list_sl *list, uint *count_array ) { /* Martin A. Hansen, September 2008 */ @@ -295,6 +401,67 @@ void scan_list( list_sl *list, uint *count_array ) } +void rescan_list( list_sl *list, uint *count_array, size_t pos, size_t cutoff ) +{ + /* Martin A. Hansen, September 2008 */ + + /* Scan a list of blocks for biparite motifs by creating */ + /* a binary motif consisting of two blocks of 4 nucleotides */ + /* along with the distance separating them. Motifs containing */ + /* N's are skipped. */ + + node_sl *first_node = NULL; + node_sl *next_node = NULL; + bitblock *block1 = NULL; + bitblock *block2 = NULL; + int i = 0; + ushort dist = 0; + uint motif_bin = 0; + uint j = 0; + uint count = 0; + +// bitblock_list_print( list ); + + first_node = list->first; + + block1 = ( bitblock * ) first_node->val; + + if ( ! block1->hasN ) + { + next_node = first_node->next; + + for ( i = 0; i < BLOCK_SIZE_NT - 1; i++ ) + { + next_node = next_node->next; + + j++; + } + + for ( next_node = next_node; next_node != NULL; next_node = next_node->next ) + { + block2 = ( bitblock * ) next_node->val; + + if ( ! block2->hasN ) + { + motif_bin = blocks2motif( block1->bin, block2->bin, dist ); + + // bitblock_list_print( list ); /* DEBUG */ + + count = count_array[ motif_bin ]; + + if ( count > cutoff ) { + printf( "%zu\t%u\t%u\n", pos + j, motif_bin, count ); + } + } + + dist++; + + j++; + } + } +} + + bitblock *bitblock_new() { /* Martin A. Hansen, September 2008 */ diff --git a/code_c/Maasha/src/inc/seq.h b/code_c/Maasha/src/inc/seq.h index 2f9cc7a..21db8be 100644 --- a/code_c/Maasha/src/inc/seq.h +++ b/code_c/Maasha/src/inc/seq.h @@ -1,5 +1,9 @@ /* Martin Asser Hansen (mail@maasha.dk) Copyright (C) 2008 - All right reserved */ +/* Constants for allocating memory for sequence entries. */ +#define MAX_SEQ_NAME 1024 +#define MAX_SEQ 250000000 + /* Macro to test if a given char is sequence (DNA, RNA, Protein, indels. brackets, etc ). */ #define isseq( x ) ( x > 32 && x < 127 ) ? 1 : 0 @@ -9,8 +13,12 @@ /* Macro to test if a given char is RNA. */ #define isRNA( c ) ( c == 'A' || c == 'a' || c == 'U' || c == 'u' || c == 'C' || c == 'c' || c == 'G' || c == 'g' || c == 'N' || c == 'n' ) ? 1 : 0 -#define MAX_SEQ_NAME 1024 -#define MAX_SEQ 250000000 +/* Macros for converting DNA ASCII to binary. */ +#define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */ +#define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */ +#define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */ +#define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */ + /* Definition of a sequence entry */ struct _seq_entry @@ -22,6 +30,11 @@ struct _seq_entry typedef struct _seq_entry seq_entry; +/* Byte array for fast convertion of binary blocks to DNA. */ +/* Binary blocks holds four nucleotides encoded in 2 bits: */ +/* A=00 T=11 C=01 G=10 */ +char *bin2dna[256]; + /* Initialize a new sequence entry. */ seq_entry *seq_new( size_t max_seq_name, size_t max_seq ); diff --git a/code_c/Maasha/src/lib/seq.c b/code_c/Maasha/src/lib/seq.c index b796034..09b88d0 100644 --- a/code_c/Maasha/src/lib/seq.c +++ b/code_c/Maasha/src/lib/seq.c @@ -4,6 +4,44 @@ #include "mem.h" #include "seq.h" +/* Byte array for fast convertion of binary blocks to DNA. */ +/* Binary blocks holds four nucleotides encoded in 2 bits: */ +/* A=00 T=11 C=01 G=10 */ +char *bin2dna[256] = { + "AAAA", "AAAC", "AAAG", "AAAT", "AACA", "AACC", "AACG", "AACT", + "AAGA", "AAGC", "AAGG", "AAGT", "AATA", "AATC", "AATG", "AATT", + "ACAA", "ACAC", "ACAG", "ACAT", "ACCA", "ACCC", "ACCG", "ACCT", + "ACGA", "ACGC", "ACGG", "ACGT", "ACTA", "ACTC", "ACTG", "ACTT", + "AGAA", "AGAC", "AGAG", "AGAT", "AGCA", "AGCC", "AGCG", "AGCT", + "AGGA", "AGGC", "AGGG", "AGGT", "AGTA", "AGTC", "AGTG", "AGTT", + "ATAA", "ATAC", "ATAG", "ATAT", "ATCA", "ATCC", "ATCG", "ATCT", + "ATGA", "ATGC", "ATGG", "ATGT", "ATTA", "ATTC", "ATTG", "ATTT", + "CAAA", "CAAC", "CAAG", "CAAT", "CACA", "CACC", "CACG", "CACT", + "CAGA", "CAGC", "CAGG", "CAGT", "CATA", "CATC", "CATG", "CATT", + "CCAA", "CCAC", "CCAG", "CCAT", "CCCA", "CCCC", "CCCG", "CCCT", + "CCGA", "CCGC", "CCGG", "CCGT", "CCTA", "CCTC", "CCTG", "CCTT", + "CGAA", "CGAC", "CGAG", "CGAT", "CGCA", "CGCC", "CGCG", "CGCT", + "CGGA", "CGGC", "CGGG", "CGGT", "CGTA", "CGTC", "CGTG", "CGTT", + "CTAA", "CTAC", "CTAG", "CTAT", "CTCA", "CTCC", "CTCG", "CTCT", + "CTGA", "CTGC", "CTGG", "CTGT", "CTTA", "CTTC", "CTTG", "CTTT", + "GAAA", "GAAC", "GAAG", "GAAT", "GACA", "GACC", "GACG", "GACT", + "GAGA", "GAGC", "GAGG", "GAGT", "GATA", "GATC", "GATG", "GATT", + "GCAA", "GCAC", "GCAG", "GCAT", "GCCA", "GCCC", "GCCG", "GCCT", + "GCGA", "GCGC", "GCGG", "GCGT", "GCTA", "GCTC", "GCTG", "GCTT", + "GGAA", "GGAC", "GGAG", "GGAT", "GGCA", "GGCC", "GGCG", "GGCT", + "GGGA", "GGGC", "GGGG", "GGGT", "GGTA", "GGTC", "GGTG", "GGTT", + "GTAA", "GTAC", "GTAG", "GTAT", "GTCA", "GTCC", "GTCG", "GTCT", + "GTGA", "GTGC", "GTGG", "GTGT", "GTTA", "GTTC", "GTTG", "GTTT", + "TAAA", "TAAC", "TAAG", "TAAT", "TACA", "TACC", "TACG", "TACT", + "TAGA", "TAGC", "TAGG", "TAGT", "TATA", "TATC", "TATG", "TATT", + "TCAA", "TCAC", "TCAG", "TCAT", "TCCA", "TCCC", "TCCG", "TCCT", + "TCGA", "TCGC", "TCGG", "TCGT", "TCTA", "TCTC", "TCTG", "TCTT", + "TGAA", "TGAC", "TGAG", "TGAT", "TGCA", "TGCC", "TGCG", "TGCT", + "TGGA", "TGGC", "TGGG", "TGGT", "TGTA", "TGTC", "TGTG", "TGTT", + "TTAA", "TTAC", "TTAG", "TTAT", "TTCA", "TTCC", "TTCG", "TTCT", + "TTGA", "TTGC", "TTGG", "TTGT", "TTTA", "TTTC", "TTTG", "TTTT" +}; + seq_entry *seq_new( size_t max_seq_name, size_t max_seq ) { diff --git a/code_perl/Maasha/Calc.pm b/code_perl/Maasha/Calc.pm index 4271182..51f841f 100644 --- a/code_perl/Maasha/Calc.pm +++ b/code_perl/Maasha/Calc.pm @@ -184,6 +184,39 @@ sub median } +sub standard_deviation +{ + # Martin A. Hansen, September 2008 + + # Given a list of numbers calculate and return the standard deviation: + # http://en.wikipedia.org/wiki/Standard_deviation + + my ( $numbers, # list of numbers + ) = @_; + + # Returns a float. + + my ( $mean_num, $num, $div, $div_sum, $mean_div, $std_div ); + + $mean_num = mean( $numbers ); + + $div_sum = 0; + + foreach $num ( @{ $numbers } ) + { + $div = ( $num - $mean_num ) ** 2; + + $div_sum += $div; + } + + $mean_div = $div_sum / scalar @{ $numbers }; + + $std_div = sqrt( abs( $mean_div ) ); + + return $std_div; +} + + sub min { # Martin A. Hansen, August 2006. -- 2.39.5