INC = -I $(INC_DIR)
LIB = -lm $(LIB_DIR)*.o
-all: libs utest bed2fixedstep bed_sort bipartite_scan bipartite_decode fasta_count repeat-O-matic
+all: libs utest bed2fixedstep bed2tag_contigs bed_sort bipartite_scan bipartite_decode fasta_count repeat-O-matic
libs:
cd $(LIB_DIR) && ${MAKE} all
bed2fixedstep: bed2fixedstep.c
$(CC) $(Cflags) $(INC) $(LIB) bed2fixedstep.c -o bed2fixedstep
+bed2tag_contigs: bed2tag_contigs.c
+ $(CC) $(Cflags) $(INC) $(LIB) bed2tag_contigs.c -o bed2tag_contigs
+
bed_sort: bed_sort.c
$(CC) $(Cflags) $(INC) $(LIB) bed_sort.c -o bed_sort
cd $(LIB_DIR) && ${MAKE} clean
cd $(TEST_DIR) && ${MAKE} clean
rm bed2fixedstep
+ rm bed2tag_contigs
rm bed_sort
rm bipartite_scan
rm bipartite_decode
--- /dev/null
+#include "common.h"
+#include "mem.h"
+#include "list.h"
+#include "ucsc.h"
+#include "hash.h"
+#include "barray.h"
+
+#define BED_COLS 5
+#define HASH_SIZE 8
+#define BARRAY_SIZE ( 1 << 16 )
+
+
+static void usage()
+{
+ fprintf( stderr,
+ "\n"
+ "bed2tag_contigs assembles overlapping BED type entries into tag\n"
+ "contigs if the clone count (assumed to be appended to the Q_ID field) of each\n"
+ "entry is higher than the times the entry was mapped to the genome (this\n"
+ "information is assumed to be the SCORE field). A tag contig score is calculated\n"
+ "as the highest density of overlapping tags (assuming that these tags are\n"
+ "uniquely mapping):\n"
+ "\n"
+ " tags ----------- Q_ID: ID_2\n"
+ " ------------- Q_ID: ID_3\n"
+ " --------------- Q_ID: ID_1\n"
+ " ------------ Q_ID: ID_2\n"
+ " tag contig =============================\n"
+ " scores 22555566666444411333322222222\n"
+ "\n"
+ " tag contig score = 6\n"
+ "\n"
+ "Thus, tags are only assembled into tag contigs if their expression level is\n"
+ "higher than their repetitiveness; a tag sequenced 10 times, i.e. with a clone\n"
+ "count of 10, will only be considered if was found in the genome fewer than 10\n"
+ "times, i.e. have a SCORE below 10.\n"
+ "\n"
+ "A forthrunning number prefixed with TC is added as Q_ID for the resulting tag\n"
+ "contigs.\n"
+ "\n"
+ "Usage: bed2tag_contigs < <BED file(s)> > <BED file>\n\n"
+ );
+
+ exit( EXIT_FAILURE );
+}
+
+
+long get_score( char *str )
+{
+ /* Martin A. Hansen, December 2008. */
+
+ /* Extract the last decimal number after _
+ * in a string and return that. If no number
+ * was found return 1. */
+
+ char *c;
+ long score = 1;
+
+ if ( ( c = strrchr( str, '_' ) ) != NULL ) {
+ score = strtol( &c[ 1 ], NULL , 10 );
+ }
+
+ return score;
+}
+
+
+int main( int argc, char *argv[] )
+{
+ bed_entry *entry = NULL;
+ hash *chr_hash = NULL;
+ hash_elem *bucket = NULL;
+ barray *ba = NULL;
+ ushort score = 0;
+ size_t i = 0;
+ char *chr = NULL;
+ size_t beg = 0;
+ size_t end = 0;
+ size_t pos = 0;
+ ushort max = 0;
+ size_t id = 0;
+
+ entry = bed_entry_new( BED_COLS );
+ chr_hash = hash_new( HASH_SIZE );
+
+ if ( isatty( fileno( stdin ) ) ) {
+ usage();
+ }
+
+ while ( ( bed_entry_get( stdin, &entry ) ) )
+ {
+// bed_entry_put( entry, entry->cols );
+
+ ba = ( barray * ) hash_get( chr_hash, entry->chr );
+
+ if ( ba == NULL )
+ {
+ ba = barray_new( BARRAY_SIZE );
+
+ hash_add( chr_hash, entry->chr, ba );
+ }
+
+ score = ( ushort ) get_score( entry->q_id );
+
+ if ( score > entry->score ) {
+ barray_interval_inc( ba, entry->chr_beg, entry->chr_end - 1, ( short ) score / entry->score );
+ }
+ }
+
+// barray_print( ba );
+
+ for ( i = 0; i < chr_hash->table_size; i++ )
+ {
+ for ( bucket = chr_hash->table[ i ]; bucket != NULL; bucket = bucket->next )
+ {
+ chr = bucket->key;
+ ba = ( barray * ) bucket->val;
+
+ pos = 0;
+
+ while ( barray_interval_scan( ba, &pos, &beg, &end ) )
+ {
+ max = barray_interval_max( ba, beg, end );
+
+// printf( "chr: %s pos: %zu beg: %zu end: %zu max: %hd\n", chr, pos, beg, end, max );
+
+ printf( "%s\t%zu\t%zu\tID_%08zu\t%hd\n", chr, beg, end + 1, id, max );
+
+ id++;
+ }
+ }
+ }
+
+ return EXIT_SUCCESS;
+}
* for the next interval of non-zero values. */
bool barray_interval_scan( barray *ba, size_t *pos_pt, size_t *beg_pt, size_t *end_pt );
+/* Locate the max value in an interval within a byte array. */
+ushort barray_interval_max( barray *ba, size_t beg, size_t end );
+
/* Deallocates a byte array and set it to NULL. */
void barray_destroy( barray **ba_ppt );
int i;
- printf( "\nnmemb: %zu end: %zu\n", ba->nmemb, ba->end );
+ printf( "\nba->nmemb: %zu ba->end: %zu\n", ba->nmemb, ba->end );
for ( i = 0; i < ba->nmemb; i++ ) {
printf( "%d: %d\n", i, ba->array[ i ] );
assert( beg >= 0 );
assert( end >= 0 );
assert( end >= beg );
+ assert( score > 0 );
if ( end > ba->nmemb ) {
barray_resize( ba, barray_new_size( end ) );
size_t beg = *beg_pt;
size_t end = *end_pt;
- if ( pos > ba->end ) {
+ if ( pos > ba->end || ba->end == 0 ) {
return FALSE;
}
}
+ushort barray_interval_max( barray *ba, size_t beg, size_t end )
+{
+ /* Martin A. Hansen, December 2008. */
+
+ /* Locate the max value in an interval within a byte array. */
+
+ size_t i = 0;
+ ushort max = 0;
+
+ for ( i = beg; i <= end; i++ ) {
+ max = MAX( max, ba->array[ i ] );
+ }
+
+ return max;
+}
+
+
void barray_destroy( barray **ba_ppt )
{
/* Martin A. Hansen, November 2008. */
static void test_barray_print();
static void test_barray_interval_inc();
static void test_barray_interval_scan();
+static void test_barray_interval_max();
static void test_barray_destroy();
test_barray_print();
test_barray_interval_inc();
test_barray_interval_scan();
+ test_barray_interval_max();
test_barray_destroy();
fprintf( stderr, "Done\n\n" );
ba = barray_new( nmemb );
+/*
barray_interval_inc( ba, 1, 2, 1 );
barray_interval_inc( ba, 4, 5, 1 );
-/*
barray_interval_inc( ba, 0, 0, 3 );
barray_interval_inc( ba, 0, 3, 3 );
barray_interval_inc( ba, 9, 9, 3 );
barray_interval_inc( ba, 25, 35, 2 );
*/
+// barray_print( ba );
+
while ( barray_interval_scan( ba, &pos, &beg, &end ) ) {
// printf( "beg: %zu end: %zu\n", beg, end );
}
+ fprintf( stderr, "OK\n" );
+}
+
+
+void test_barray_interval_max()
+{
+ fprintf( stderr, " Testing barray_interval_max ... " );
+
+ size_t nmemb = 100;
+ size_t pos = 0;
+ size_t beg = 0;
+ size_t end = 0;
+ ushort max = 0;
+ barray *ba = NULL;
+
+ ba = barray_new( nmemb );
+
+ barray_interval_inc( ba, 1, 2, 1 );
+ barray_interval_inc( ba, 4, 5, 1 );
+ barray_interval_inc( ba, 0, 0, 3 );
+ barray_interval_inc( ba, 0, 3, 3 );
+ barray_interval_inc( ba, 9, 9, 3 );
+ barray_interval_inc( ba, 11, 11, 3 );
+ barray_interval_inc( ba, 19, 29, 3 );
+ barray_interval_inc( ba, 25, 35, 2 );
+ barray_interval_inc( ba, 35, 35, 10 );
+
+ while ( barray_interval_scan( ba, &pos, &beg, &end ) )
+ {
+ max = barray_interval_max( ba, beg, end );
+
+// printf( "beg: %zu end: %zu max: %hd\n", beg, end, max );
+ }
+
// barray_print( ba );
fprintf( stderr, "OK\n" );