#include "common.h"
#include "filesys.h"
+#include "seq.h"
#define BITS_IN_BYTE 8 /* number of bits in one byte. */
#define BLOCK_SPACE_MAX 64 /* maximum space between two blocks. */
#define BLOCK_MASK ( ( BLOCK_SPACE_MAX << 1 ) - 1 ) /* mask for printing block space. */
-/* Byte array for fast convertion of binary blocks back to DNA. */
-char *bin2dna[256] = {
- "AAAA", "AAAC", "AAAG", "AAAT", "AACA", "AACC", "AACG", "AACT",
- "AAGA", "AAGC", "AAGG", "AAGT", "AATA", "AATC", "AATG", "AATT",
- "ACAA", "ACAC", "ACAG", "ACAT", "ACCA", "ACCC", "ACCG", "ACCT",
- "ACGA", "ACGC", "ACGG", "ACGT", "ACTA", "ACTC", "ACTG", "ACTT",
- "AGAA", "AGAC", "AGAG", "AGAT", "AGCA", "AGCC", "AGCG", "AGCT",
- "AGGA", "AGGC", "AGGG", "AGGT", "AGTA", "AGTC", "AGTG", "AGTT",
- "ATAA", "ATAC", "ATAG", "ATAT", "ATCA", "ATCC", "ATCG", "ATCT",
- "ATGA", "ATGC", "ATGG", "ATGT", "ATTA", "ATTC", "ATTG", "ATTT",
- "CAAA", "CAAC", "CAAG", "CAAT", "CACA", "CACC", "CACG", "CACT",
- "CAGA", "CAGC", "CAGG", "CAGT", "CATA", "CATC", "CATG", "CATT",
- "CCAA", "CCAC", "CCAG", "CCAT", "CCCA", "CCCC", "CCCG", "CCCT",
- "CCGA", "CCGC", "CCGG", "CCGT", "CCTA", "CCTC", "CCTG", "CCTT",
- "CGAA", "CGAC", "CGAG", "CGAT", "CGCA", "CGCC", "CGCG", "CGCT",
- "CGGA", "CGGC", "CGGG", "CGGT", "CGTA", "CGTC", "CGTG", "CGTT",
- "CTAA", "CTAC", "CTAG", "CTAT", "CTCA", "CTCC", "CTCG", "CTCT",
- "CTGA", "CTGC", "CTGG", "CTGT", "CTTA", "CTTC", "CTTG", "CTTT",
- "GAAA", "GAAC", "GAAG", "GAAT", "GACA", "GACC", "GACG", "GACT",
- "GAGA", "GAGC", "GAGG", "GAGT", "GATA", "GATC", "GATG", "GATT",
- "GCAA", "GCAC", "GCAG", "GCAT", "GCCA", "GCCC", "GCCG", "GCCT",
- "GCGA", "GCGC", "GCGG", "GCGT", "GCTA", "GCTC", "GCTG", "GCTT",
- "GGAA", "GGAC", "GGAG", "GGAT", "GGCA", "GGCC", "GGCG", "GGCT",
- "GGGA", "GGGC", "GGGG", "GGGT", "GGTA", "GGTC", "GGTG", "GGTT",
- "GTAA", "GTAC", "GTAG", "GTAT", "GTCA", "GTCC", "GTCG", "GTCT",
- "GTGA", "GTGC", "GTGG", "GTGT", "GTTA", "GTTC", "GTTG", "GTTT",
- "TAAA", "TAAC", "TAAG", "TAAT", "TACA", "TACC", "TACG", "TACT",
- "TAGA", "TAGC", "TAGG", "TAGT", "TATA", "TATC", "TATG", "TATT",
- "TCAA", "TCAC", "TCAG", "TCAT", "TCCA", "TCCC", "TCCG", "TCCT",
- "TCGA", "TCGC", "TCGG", "TCGT", "TCTA", "TCTC", "TCTG", "TCTT",
- "TGAA", "TGAC", "TGAG", "TGAT", "TGCA", "TGCC", "TGCG", "TGCT",
- "TGGA", "TGGC", "TGGG", "TGGT", "TGTA", "TGTC", "TGTG", "TGTT",
- "TTAA", "TTAC", "TTAG", "TTAT", "TTCA", "TTCC", "TTCG", "TTCT",
- "TTGA", "TTGC", "TTGG", "TTGT", "TTTA", "TTTC", "TTTG", "TTTT"
-};
-
-
/* Function declarations. */
void run_decode( int argc, char *argv[] );
void print_usage();
#define BLOCK_SPACE_MAX 64 /* maximum space between two blocks. */
#define BLOCK_MASK ( ( BLOCK_SPACE_MAX << 1 ) - 1 ) /* mask for printing block space. */
#define COUNT_ARRAY_NMEMB ( 1 << 30 ) /* number of objects in the unsigned int count array. */
-#define CUTOFF 10000 /* minimum number of motifs in output. */
+#define CUTOFF 1 /* minimum number of motifs in output. */
-#define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */
-#define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */
-#define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */
-#define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */
-
/* Structure that will hold one tetra nucleotide block. */
struct _bitblock
{
void run_scan( int argc, char *argv[] );
void print_usage();
void scan_file( char *file, seq_entry *entry, uint *count_array );
+void rescan_file( char *file, seq_entry *entry, uint *count_array, size_t cutoff );
uint *count_array_new( size_t nmemb );
void scan_seq( char *seq, size_t seq_len, uint *count_array );
+void rescan_seq( char *seq, size_t seq_len, uint *count_array, size_t cutoff );
void scan_list( list_sl *list, uint *count_array );
+void rescan_list( list_sl *list, uint *count_array, size_t pos, size_t cutoff );
bitblock *bitblock_new();
uint blocks2motif( uchar bin1, uchar bin2, ushort dist );
void count_array_print( uint *count_array, size_t nmemb, size_t cutoff );
/* Print usage and exit if no files in argument. */
- fprintf( stderr,
- "Usage: bipartite_scam <FASTA file(s)> > result.csv\n"
- );
+ fprintf( stderr, "Usage: bipartite_scam <FASTA file(s)> > result.csv\n" );
exit( EXIT_SUCCESS );
}
count_array_print( count_array, COUNT_ARRAY_NMEMB, CUTOFF );
fprintf( stderr, "done.\n" );
+ file = argv[ 1 ];
+
+ fprintf( stderr, "Rescanning file: %s\n", file );
+ rescan_file( file, entry, count_array, CUTOFF );
+ fprintf( stderr, "done.\n" );
+
seq_destroy( entry );
mem_free( &count_array );
{
/* Martin A. Hansen, September 2008 */
- /* Scan all FASTA entries of a file in both */
- /* sense and anti-sense directions. */
+ /* Scan all FASTA entries of a file. */
FILE *fp = read_open( file );
}
+void rescan_file( char *file, seq_entry *entry, uint *count_array, size_t cutoff )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Rescan all FASTA entries of a file. */
+
+ FILE *fp = read_open( file );
+
+ while ( fasta_get_entry( fp, &entry ) == TRUE )
+ {
+ fprintf( stderr, " Rescanning: %s (%zu nt) ... ", entry->seq_name, entry->seq_len );
+
+ rescan_seq( entry->seq, entry->seq_len, count_array, cutoff );
+
+ fprintf( stderr, "done.\n" );
+ }
+
+ close_stream( fp );
+}
+
+
void scan_seq( char *seq, size_t seq_len, uint *count_array )
{
/* Martin A. Hansen, September 2008 */
}
+void rescan_seq( char *seq, size_t seq_len, uint *count_array, size_t cutoff )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Run a sliding window over a given sequence. The window */
+ /* consists of a list where new blocks of 4 nucleotides */
+ /* are pushed onto one end while at the same time old */
+ /* blocks are popped from the other end. The number of */
+ /* in the list is determined by the maximum seperator. */
+ /* Everytime we have a full window, the window is scanned */
+ /* for motifs. */
+
+ bitblock *block = NULL;
+ size_t b_count = 0;
+ ushort n_count = 0;
+ size_t i = 0;
+ uchar bin = 0;
+ bool first_node = TRUE;
+ node_sl *new_node = NULL;
+ node_sl *old_node = NULL;
+ list_sl *list = list_sl_new();
+
+ for ( i = 0; seq[ i ]; i++ )
+ {
+ bin <<= BITS_IN_NT;
+
+ switch( seq[ i ] )
+ {
+ case 'A': case 'a': add_A( bin ); break;
+ case 'T': case 't': add_T( bin ); break;
+ case 'C': case 'c': add_C( bin ); break;
+ case 'G': case 'g': add_G( bin ); break;
+ default: n_count = BLOCK_SIZE_NT; break;
+ }
+
+ if ( i > BLOCK_SIZE_NT - 2 )
+ {
+ b_count++;
+
+ block = bitblock_new();
+ block->bin = bin;
+
+ if ( n_count > 0 )
+ {
+ block->hasN = TRUE;
+ n_count--;
+ }
+
+ new_node = node_sl_new();
+ new_node->val = block;
+
+ if ( first_node ) {
+ list_sl_add_beg( &list, &new_node );
+ } else {
+ list_sl_add_after( &old_node, &new_node );
+ }
+
+ old_node = new_node;
+
+ first_node = FALSE;
+
+ if ( b_count > BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
+ {
+ // bitblock_list_print( list ); /* DEBUG */
+
+ rescan_list( list, count_array, i, cutoff );
+
+ list_sl_remove_beg( &list );
+ }
+ }
+ }
+
+ /* if the list is shorter than BLOCK_SPACE_MAX + BLOCK_SIZE_NT */
+ if ( b_count <= BLOCK_SPACE_MAX + BLOCK_SIZE_NT )
+ {
+ // bitblock_list_print( list ); /* DEBUG */
+
+ rescan_list( list, count_array, i, cutoff );
+ }
+
+ list_sl_destroy( &list );
+}
+
+
void scan_list( list_sl *list, uint *count_array )
{
/* Martin A. Hansen, September 2008 */
}
+void rescan_list( list_sl *list, uint *count_array, size_t pos, size_t cutoff )
+{
+ /* Martin A. Hansen, September 2008 */
+
+ /* Scan a list of blocks for biparite motifs by creating */
+ /* a binary motif consisting of two blocks of 4 nucleotides */
+ /* along with the distance separating them. Motifs containing */
+ /* N's are skipped. */
+
+ node_sl *first_node = NULL;
+ node_sl *next_node = NULL;
+ bitblock *block1 = NULL;
+ bitblock *block2 = NULL;
+ int i = 0;
+ ushort dist = 0;
+ uint motif_bin = 0;
+ uint j = 0;
+ uint count = 0;
+
+// bitblock_list_print( list );
+
+ first_node = list->first;
+
+ block1 = ( bitblock * ) first_node->val;
+
+ if ( ! block1->hasN )
+ {
+ next_node = first_node->next;
+
+ for ( i = 0; i < BLOCK_SIZE_NT - 1; i++ )
+ {
+ next_node = next_node->next;
+
+ j++;
+ }
+
+ for ( next_node = next_node; next_node != NULL; next_node = next_node->next )
+ {
+ block2 = ( bitblock * ) next_node->val;
+
+ if ( ! block2->hasN )
+ {
+ motif_bin = blocks2motif( block1->bin, block2->bin, dist );
+
+ // bitblock_list_print( list ); /* DEBUG */
+
+ count = count_array[ motif_bin ];
+
+ if ( count > cutoff ) {
+ printf( "%zu\t%u\t%u\n", pos + j, motif_bin, count );
+ }
+ }
+
+ dist++;
+
+ j++;
+ }
+ }
+}
+
+
bitblock *bitblock_new()
{
/* Martin A. Hansen, September 2008 */
/* Martin Asser Hansen (mail@maasha.dk) Copyright (C) 2008 - All right reserved */
+/* Constants for allocating memory for sequence entries. */
+#define MAX_SEQ_NAME 1024
+#define MAX_SEQ 250000000
+
/* Macro to test if a given char is sequence (DNA, RNA, Protein, indels. brackets, etc ). */
#define isseq( x ) ( x > 32 && x < 127 ) ? 1 : 0
/* Macro to test if a given char is RNA. */
#define isRNA( c ) ( c == 'A' || c == 'a' || c == 'U' || c == 'u' || c == 'C' || c == 'c' || c == 'G' || c == 'g' || c == 'N' || c == 'n' ) ? 1 : 0
-#define MAX_SEQ_NAME 1024
-#define MAX_SEQ 250000000
+/* Macros for converting DNA ASCII to binary. */
+#define add_A( c ) /* add 00 to the rightmost two bits of bin (i.e. do nothing). */
+#define add_T( c ) ( c |= 3 ) /* add 11 on the rightmost two bits of c. */
+#define add_C( c ) ( c |= 1 ) /* add 01 on the rightmost two bits of c. */
+#define add_G( c ) ( c |= 2 ) /* add 10 on the rightmost two bits of c. */
+
/* Definition of a sequence entry */
struct _seq_entry
typedef struct _seq_entry seq_entry;
+/* Byte array for fast convertion of binary blocks to DNA. */
+/* Binary blocks holds four nucleotides encoded in 2 bits: */
+/* A=00 T=11 C=01 G=10 */
+char *bin2dna[256];
+
/* Initialize a new sequence entry. */
seq_entry *seq_new( size_t max_seq_name, size_t max_seq );
#include "mem.h"
#include "seq.h"
+/* Byte array for fast convertion of binary blocks to DNA. */
+/* Binary blocks holds four nucleotides encoded in 2 bits: */
+/* A=00 T=11 C=01 G=10 */
+char *bin2dna[256] = {
+ "AAAA", "AAAC", "AAAG", "AAAT", "AACA", "AACC", "AACG", "AACT",
+ "AAGA", "AAGC", "AAGG", "AAGT", "AATA", "AATC", "AATG", "AATT",
+ "ACAA", "ACAC", "ACAG", "ACAT", "ACCA", "ACCC", "ACCG", "ACCT",
+ "ACGA", "ACGC", "ACGG", "ACGT", "ACTA", "ACTC", "ACTG", "ACTT",
+ "AGAA", "AGAC", "AGAG", "AGAT", "AGCA", "AGCC", "AGCG", "AGCT",
+ "AGGA", "AGGC", "AGGG", "AGGT", "AGTA", "AGTC", "AGTG", "AGTT",
+ "ATAA", "ATAC", "ATAG", "ATAT", "ATCA", "ATCC", "ATCG", "ATCT",
+ "ATGA", "ATGC", "ATGG", "ATGT", "ATTA", "ATTC", "ATTG", "ATTT",
+ "CAAA", "CAAC", "CAAG", "CAAT", "CACA", "CACC", "CACG", "CACT",
+ "CAGA", "CAGC", "CAGG", "CAGT", "CATA", "CATC", "CATG", "CATT",
+ "CCAA", "CCAC", "CCAG", "CCAT", "CCCA", "CCCC", "CCCG", "CCCT",
+ "CCGA", "CCGC", "CCGG", "CCGT", "CCTA", "CCTC", "CCTG", "CCTT",
+ "CGAA", "CGAC", "CGAG", "CGAT", "CGCA", "CGCC", "CGCG", "CGCT",
+ "CGGA", "CGGC", "CGGG", "CGGT", "CGTA", "CGTC", "CGTG", "CGTT",
+ "CTAA", "CTAC", "CTAG", "CTAT", "CTCA", "CTCC", "CTCG", "CTCT",
+ "CTGA", "CTGC", "CTGG", "CTGT", "CTTA", "CTTC", "CTTG", "CTTT",
+ "GAAA", "GAAC", "GAAG", "GAAT", "GACA", "GACC", "GACG", "GACT",
+ "GAGA", "GAGC", "GAGG", "GAGT", "GATA", "GATC", "GATG", "GATT",
+ "GCAA", "GCAC", "GCAG", "GCAT", "GCCA", "GCCC", "GCCG", "GCCT",
+ "GCGA", "GCGC", "GCGG", "GCGT", "GCTA", "GCTC", "GCTG", "GCTT",
+ "GGAA", "GGAC", "GGAG", "GGAT", "GGCA", "GGCC", "GGCG", "GGCT",
+ "GGGA", "GGGC", "GGGG", "GGGT", "GGTA", "GGTC", "GGTG", "GGTT",
+ "GTAA", "GTAC", "GTAG", "GTAT", "GTCA", "GTCC", "GTCG", "GTCT",
+ "GTGA", "GTGC", "GTGG", "GTGT", "GTTA", "GTTC", "GTTG", "GTTT",
+ "TAAA", "TAAC", "TAAG", "TAAT", "TACA", "TACC", "TACG", "TACT",
+ "TAGA", "TAGC", "TAGG", "TAGT", "TATA", "TATC", "TATG", "TATT",
+ "TCAA", "TCAC", "TCAG", "TCAT", "TCCA", "TCCC", "TCCG", "TCCT",
+ "TCGA", "TCGC", "TCGG", "TCGT", "TCTA", "TCTC", "TCTG", "TCTT",
+ "TGAA", "TGAC", "TGAG", "TGAT", "TGCA", "TGCC", "TGCG", "TGCT",
+ "TGGA", "TGGC", "TGGG", "TGGT", "TGTA", "TGTC", "TGTG", "TGTT",
+ "TTAA", "TTAC", "TTAG", "TTAT", "TTCA", "TTCC", "TTCG", "TTCT",
+ "TTGA", "TTGC", "TTGG", "TTGT", "TTTA", "TTTC", "TTTG", "TTTT"
+};
+
seq_entry *seq_new( size_t max_seq_name, size_t max_seq )
{
}
+sub standard_deviation
+{
+ # Martin A. Hansen, September 2008
+
+ # Given a list of numbers calculate and return the standard deviation:
+ # http://en.wikipedia.org/wiki/Standard_deviation
+
+ my ( $numbers, # list of numbers
+ ) = @_;
+
+ # Returns a float.
+
+ my ( $mean_num, $num, $div, $div_sum, $mean_div, $std_div );
+
+ $mean_num = mean( $numbers );
+
+ $div_sum = 0;
+
+ foreach $num ( @{ $numbers } )
+ {
+ $div = ( $num - $mean_num ) ** 2;
+
+ $div_sum += $div;
+ }
+
+ $mean_div = $div_sum / scalar @{ $numbers };
+
+ $std_div = sqrt( abs( $mean_div ) );
+
+ return $std_div;
+}
+
+
sub min
{
# Martin A. Hansen, August 2006.