10 #define BARRAY_SIZE ( 1 << 16 )
17 "bed2tag_contigs assembles overlapping BED type entries into tag\n"
18 "contigs if the clone count (assumed to be appended to the Q_ID field) of each\n"
19 "entry is higher than the times the entry was mapped to the genome (this\n"
20 "information is assumed to be the SCORE field). A tag contig score is calculated\n"
21 "as the highest density of overlapping tags (assuming that these tags are\n"
22 "uniquely mapping):\n"
24 " tags ----------- Q_ID: ID_2\n"
25 " ------------- Q_ID: ID_3\n"
26 " --------------- Q_ID: ID_1\n"
27 " ------------ Q_ID: ID_2\n"
28 " tag contig =============================\n"
29 " scores 22555566666444411333322222222\n"
31 " tag contig score = 6\n"
33 "Thus, tags are only assembled into tag contigs if their expression level is\n"
34 "higher than their repetitiveness; a tag sequenced 10 times, i.e. with a clone\n"
35 "count of 10, will only be considered if was found in the genome fewer than 10\n"
36 "times, i.e. have a SCORE below 10.\n"
38 "A forthrunning number prefixed with TC is added as Q_ID for the resulting tag\n"
41 "Usage: bed2tag_contigs < <BED file(s)> > <BED file>\n\n"
48 long get_score( char *str )
50 /* Martin A. Hansen, December 2008. */
52 /* Extract the last decimal number after _
53 * in a string and return that. If no number
54 * was found return 1. */
59 if ( ( c = strrchr( str, '_' ) ) != NULL ) {
60 score = strtol( &c[ 1 ], NULL , 10 );
67 int main( int argc, char *argv[] )
69 bed_entry *entry = NULL;
70 hash *chr_hash = NULL;
71 hash_elem *bucket = NULL;
82 entry = bed_entry_new( BED_COLS );
83 chr_hash = hash_new( HASH_SIZE );
85 if ( isatty( fileno( stdin ) ) ) {
89 while ( ( bed_entry_get( stdin, &entry ) ) )
91 // bed_entry_put( entry, entry->cols );
93 ba = ( barray * ) hash_get( chr_hash, entry->chr );
97 ba = barray_new( BARRAY_SIZE );
99 hash_add( chr_hash, entry->chr, ba );
102 score = ( uint ) get_score( entry->q_id );
104 if ( score > entry->score && entry->score > 0 ) {
105 barray_interval_inc( ba, entry->chr_beg, entry->chr_end - 1, ( uint ) ( score / entry->score ) );
109 // barray_print( ba );
111 for ( i = 0; i < chr_hash->table_size; i++ )
113 for ( bucket = chr_hash->table[ i ]; bucket != NULL; bucket = bucket->next )
116 ba = ( barray * ) bucket->val;
120 while ( barray_interval_scan( ba, &pos, &beg, &end ) )
122 max = barray_interval_max( ba, beg, end );
124 // printf( "chr: %s pos: %zu beg: %zu end: %zu max: %hd\n", chr, pos, beg, end, max );
126 printf( "%s\t%zu\t%zu\t%s_%08zu\t%u\n", chr, beg, end + 1, chr, id, ( uint ) max );