/* Returns the length of the chomped string or -1 if no newline was found. */
size_t chomp( char *string );
+/* Returns the total number of a given char in a given string. */
+size_t strchr_total( const char *string, const char c );
+
/* Locate a substr in a str starting at pos allowing for a given number of mismatches. */
/* Returns position of match begin or -1 if not found. */
size_t match_substr( size_t pos, char *str, size_t str_len, char *substr, size_t substr_len, size_t mismatch );
/* Martin Asser Hansen (mail@maasha.dk) Copyright (C) 2008 - All right reserved */
-#define BED_BUFFER 1024
+#define BED_BUFFER 2048
+#define BED_CHR_MAX 64
+#define BED_QID_MAX 256
+#define BED_ITEMRGB_MAX 16
+#define BED_BLOCKSIZES_MAX 512
+#define BED_QBEGS_MAX 512
-struct bed_entry3
-{
- char *chr;
- uint chr_beg;
- uint chr_end;
-};
-
-struct bed_entry12
+struct _bed_entry
{
+ int cols;
char *chr;
uint chr_beg;
uint chr_end;
char *q_id;
- float score;
+ uint score;
char strand;
uint thick_beg;
uint thick_end;
char *q_begs;
};
-void bed_get_entry( FILE *fp, struct bed_entry3 *bed, int cols );
-void bed_split( char *string, struct bed_entry12 *bed, int cols );
+typedef struct _bed_entry bed_entry;
+bed_entry *bed_entry_new( const int cols );
+bed_entry *bed_entry_get( FILE *fp, const int cols );
+list_sl *bed_entries_get( char *path, const int cols );
+void bed_entry_put( bed_entry *entry, int cols );
+void bed_entries_put( list_sl *entries, int cols );
+int cmp_bed3_entries_sort( const void *a, const void *b );
{
/* Martin A. Hansen, June 2008 */
- /* Unit test done.*/
-
/* Remove the last char from a string. */
/* Returns the length of the chopped string.*/
{
/* Martin A. Hansen, June 2008 */
- /* Unit test done.*/
-
/* Removes the last char from a string if the char is a newline. */
/* Returns the length of the chomped string or -1 if no newline was found. */
}
-size_t match_substr( size_t pos, char *str, size_t str_len, char *substr, size_t substr_len, size_t mismatch )
size_t strchr_total( const char *string, const char c )
{
    /* Martin A. Hansen, September 2008 */

    /* Returns the total number of a given char in a given string. */
    /* The terminating NUL is never counted, so asking for '\0' yields 0. */

    size_t      total = 0;
    const char *p     = NULL;

    /* One pass over the string. Comparing chars directly avoids indexing a */
    /* table with a char value that may be negative for bytes above 0x7F,   */
    /* and avoids recomputing the string length on every iteration.         */
    for ( p = string; *p != '\0'; p++ )
    {
        if ( *p == c ) {
            total++;
        }
    }

    return total;
}
+
+
+size_t match_substr( size_t pos, char *str, size_t str_len, char *substr, size_t substr_len, size_t mismatch )
+{
+ /* Martin A. Hansen, August 2008 */
/* Locate a substr in a str starting at pos allowing for a given number of mismatches. */
/* Returns position of match begin or -1 if not found. */
size_t match_substr_rev( size_t pos, char *str, size_t str_len, char *substr, size_t substr_len, size_t mismatch )
{
- /* Martin A. Hansen, August 2008. */
-
- /* Unit test done.*/
+ /* Martin A. Hansen, August 2008 */
/* Locate a substr in a str backwards starting at the end of */
/* str minus pos allowing for a given number of mismatches. */
return -1;
}
+
+
/* Martin Asser Hansen (mail@maasha.dk) Copyright (C) 2008 - All right reserved */
+
#include "common.h"
#include "mem.h"
+#include "filesys.h"
+#include "list.h"
+#include "strings.h"
#include "ucsc.h"
-void bed_get_entry( FILE *fp, struct bed_entry3 *bed, int cols )
+bed_entry *bed_entry_new( const int cols )
+{
+ bed_entry *entry = mem_get( sizeof( bed_entry ) );
+
+ entry->cols = cols;
+ entry->chr = mem_get( BED_CHR_MAX );
+ entry->chr_beg = 0;
+ entry->chr_end = 0;
+
+ if ( cols == 3 ) {
+ return entry;
+ }
+
+ entry->q_id = mem_get( BED_QID_MAX );
+
+ if ( cols == 4 ) {
+ return entry;
+ }
+
+ entry->score = 0;
+
+ if ( cols == 5 ) {
+ return entry;
+ }
+
+ entry->strand = 0;
+
+ if ( cols == 6 ) {
+ return entry;
+ }
+
+ entry->thick_beg = 0;
+ entry->thick_end = 0;
+ entry->itemrgb = mem_get( BED_ITEMRGB_MAX );
+ entry->blockcount = 0;;
+ entry->blocksizes = mem_get( BED_BLOCKSIZES_MAX );
+ entry->q_begs = mem_get( BED_QBEGS_MAX );
+
+ return entry;
+}
+
+
+bed_entry *bed_entry_get( FILE *fp, int cols )
{
- /* Martin A. Hansen, June 2008 */
+ bed_entry *entry = bed_entry_new( cols );
+ char buffer[ BED_BUFFER ];
+
+ assert( cols == 0 || cols == 3 || cols == 4 || cols == 5 || cols == 6 || cols == 12 );
+
+ if ( fgets( buffer, sizeof( buffer ), fp ) != NULL )
+ {
+ if ( ! cols )
+ {
+ cols = 1 + strchr_total( buffer, '\t' );
+ entry->cols = cols;
+ }
+
+ if ( cols == 3 )
+ {
+ sscanf(
+ buffer,
+ "%s\t%u\t%u",
+ entry->chr,
+ &entry->chr_beg,
+ &entry->chr_end
+ );
+
+ return entry;
+ }
+
+ if ( cols == 4 )
+ {
+ sscanf(
+ buffer,
+ "%s\t%u\t%u\t%s",
+ entry->chr,
+ &entry->chr_beg,
+ &entry->chr_end,
+ entry->q_id
+ );
+
+ return entry;
+ }
+
+ if ( cols == 5 )
+ {
+ sscanf(
+ buffer,
+ "%s\t%u\t%u\t%s\t%u",
+ entry->chr,
+ &entry->chr_beg,
+ &entry->chr_end,
+ entry->q_id,
+ &entry->score
+ );
- /* Get next 3 column bed entry from stream. */
+ return entry;
+ }
- char bed_buffer[ BED_BUFFER ];
- struct bed_entry12 *bed12 = NULL;
+ if ( cols == 6 )
+ {
+ sscanf(
+ buffer,
+ "%s\t%u\t%u\t%s\t%u\t%c",
+ entry->chr,
+ &entry->chr_beg,
+ &entry->chr_end,
+ entry->q_id,
+ &entry->score,
+ &entry->strand
+ );
- bed12 = mem_get( sizeof( bed12 ) );
+ return entry;
+ }
+
+ if ( cols == 12 )
+ {
+ sscanf(
+ buffer,
+ "%s\t%u\t%u\t%s\t%u\t%c\t%u\t%u\t%s\t%u\t%s\t%s",
+ entry->chr,
+ &entry->chr_beg,
+ &entry->chr_end,
+ entry->q_id,
+ &entry->score,
+ &entry->strand,
+ &entry->thick_beg,
+ &entry->thick_end,
+ entry->itemrgb,
+ &entry->blockcount,
+ entry->blocksizes,
+ entry->q_begs
+ );
+
+ return entry;
+ }
+ }
+
+ return NULL;
+}
+
+
+list_sl *bed_entries_get( char *path, const int cols )
+{
+ list_sl *list = list_sl_new();
+ node_sl *node = node_sl_new();
+ node_sl *old_node = NULL;
+ bed_entry *entry = NULL;
+ FILE *fp = NULL;
+
+ fp = read_open( path );
- if ( ( fgets( bed_buffer, sizeof( bed_buffer ), fp ) != NULL ) )
+ if ( ( entry = bed_entry_get( fp, cols ) ) != NULL )
{
- printf( "buffer: %s\n", bed_buffer );
+ node->val = entry;
- bed_split( bed_buffer, bed12, 3 );
+ list_sl_add_beg( &list, &node );
- return;
+ old_node = node;
}
-// return NULL;
+ while ( ( entry = bed_entry_get( fp, cols ) ) != NULL )
+ {
+ node = node_sl_new();
+
+ node->val = entry;
+
+ list_sl_add_after( &old_node, &node );
+
+ old_node = node;
+ }
+
+ close_stream( fp );
+
+ return list;
+}
+
+
+void bed_entry_put( bed_entry *entry, int cols )
+{
+ if ( ! cols ) {
+ cols = entry->cols;
+ }
+
+ if ( cols == 3 )
+ {
+ printf(
+ "%s\t%u\t%u\n",
+ entry->chr,
+ entry->chr_beg,
+ entry->chr_end
+ );
+ }
+ else if ( cols == 4 )
+ {
+ printf(
+ "%s\t%u\t%u\t%s\n",
+ entry->chr,
+ entry->chr_beg,
+ entry->chr_end,
+ entry->q_id
+ );
+ }
+ else if ( cols == 5 )
+ {
+ printf(
+ "%s\t%u\t%u\t%s\t%u\n",
+ entry->chr,
+ entry->chr_beg,
+ entry->chr_end,
+ entry->q_id,
+ entry->score
+ );
+ }
+ else if ( cols == 6 )
+ {
+ printf(
+ "%s\t%u\t%u\t%s\t%u\t%c\n",
+ entry->chr,
+ entry->chr_beg,
+ entry->chr_end,
+ entry->q_id,
+ entry->score,
+ entry->strand
+ );
+ }
+ else if ( cols == 12 )
+ {
+ printf(
+ "%s\t%u\t%u\t%s\t%u\t%c\t%u\t%u\t%s\t%u\t%s\t%s\n",
+ entry->chr,
+ entry->chr_beg,
+ entry->chr_end,
+ entry->q_id,
+ entry->score,
+ entry->strand,
+ entry->thick_beg,
+ entry->thick_end,
+ entry->itemrgb,
+ entry->blockcount,
+ entry->blocksizes,
+ entry->q_begs
+ );
+ }
+ else
+ {
+ fprintf( stderr, "ERROR: Wrong number of columns in bed_entry_put: %d\n", cols );
+
+ abort();
+ }
+}
+
+
+void bed_entries_put( list_sl *entries, int cols )
+{
+ node_sl *node = NULL;
+
+ for ( node = entries->first; node != NULL; node = node->next ) {
+ bed_entry_put( ( bed_entry * ) node->val, cols );
+ }
}
+int cmp_bed3_entries_sort( const void *a, const void *b )
+{
+ node_sl *a_node = *( ( node_sl ** ) a );
+ node_sl *b_node = *( ( node_sl ** ) b );
+
+ bed_entry *a_entry = ( bed_entry * ) a_node->val;
+ bed_entry *b_entry = ( bed_entry * ) b_node->val;
+
+ if ( a_entry->chr_end < b_entry->chr_end ) {
+ return 1;
+ } else if ( a_entry->chr_end > b_entry->chr_end ) {
+ return -1;
+ } else {
+ return 0;
+ }
+}
+
--- /dev/null
+chr4 31176 31602 AA695812 0 - 31176 31602 0 1 426, 0,
+chr4 44448 44874 AA695812 0 - 44448 44874 0 1 426, 0,
+chr4 50522 50841 AA142091 0 - 50522 50841 0 2 81,237, 0,82,
+chr4 57489 57808 AA142091 0 - 57489 57808 0 2 81,237, 0,82,
+chr4 59352 59778 AA695812 0 - 59352 59778 0 1 426, 0,
+chr4 63580 64332 AA979544 0 - 63580 64332 0 1 752, 0,
+chr4 63710 64332 AA979534 0 - 63710 64332 0 3 111,481,30, 0,111,592,
+chr4 70946 71196 AA699063 0 - 70946 71196 0 2 142,55, 0,195,
+chr4 72831 76893 AA264101 0 - 72831 76893 0 2 179,437, 0,3625,
+chr4 72872 76630 AA694817 0 - 72872 76630 0 3 83,54,174, 0,84,3584,
--- /dev/null
+chr4 31176 31602
+chr4 44448 44874
+chr4 50522 50841
+chr4 57489 57808
+chr4 59352 59778
+chr4 63580 64332
+chr4 63710 64332
+chr4 70946 71196
+chr4 72831 76893
+chr4 72872 76630
--- /dev/null
+chr4 31176 31602 AA695812
+chr4 44448 44874 AA695812
+chr4 50522 50841 AA142091
+chr4 57489 57808 AA142091
+chr4 59352 59778 AA695812
+chr4 63580 64332 AA979544
+chr4 63710 64332 AA979534
+chr4 70946 71196 AA699063
+chr4 72831 76893 AA264101
+chr4 72872 76630 AA694817
--- /dev/null
+chr4 31176 31602 AA695812 0
+chr4 44448 44874 AA695812 0
+chr4 50522 50841 AA142091 0
+chr4 57489 57808 AA142091 0
+chr4 59352 59778 AA695812 0
+chr4 63580 64332 AA979544 0
+chr4 63710 64332 AA979534 0
+chr4 70946 71196 AA699063 0
+chr4 72831 76893 AA264101 0
+chr4 72872 76630 AA694817 0
--- /dev/null
+chr4 31176 31602 AA695812 0 -
+chr4 44448 44874 AA695812 0 -
+chr4 50522 50841 AA142091 0 -
+chr4 57489 57808 AA142091 0 -
+chr4 59352 59778 AA695812 0 -
+chr4 63580 64332 AA979544 0 -
+chr4 63710 64332 AA979534 0 -
+chr4 70946 71196 AA699063 0 -
+chr4 72831 76893 AA264101 0 -
+chr4 72872 76630 AA694817 0 -
static void test_chop();
static void test_chomp();
+static void test_strchr_total();
static void test_match_substr();
static void test_match_substr_rev();
test_chop();
test_chomp();
+ test_strchr_total();
test_match_substr();
test_match_substr_rev();
}
static void test_strchr_total()
{
    /* Checks strchr_total for chars that occur, a char that does not */
    /* occur, and the empty string.                                   */

    fprintf( stderr, "   Testing strchr_total ... " );

    const char *str = "X-----X----X";

    assert( strchr_total( str, 'X' ) == 3 );
    assert( strchr_total( str, '-' ) == 9 );
    assert( strchr_total( str, 'Z' ) == 0 );
    assert( strchr_total( "", 'X' ) == 0 );

    fprintf( stderr, "OK\n" );
}
+
+
static void test_match_substr()
{
fprintf( stderr, " Testing match_substr ... " );
--- /dev/null
+#include "common.h"
+#include "filesys.h"
+#include "list.h"
+#include "ucsc.h"
+
+static void test_bed_entry_get();
+static void test_bed_entries_get();
+static void test_bed_entries_sort();
+
+
int main()
{
    /* Runs the ucsc.c tests in order, reporting progress on stderr. */

    fputs( "Running all tests for ucsc.c\n", stderr );

    test_bed_entry_get();
    test_bed_entries_get();
    test_bed_entries_sort();

    fputs( "Done\n\n", stderr );

    return EXIT_SUCCESS;
}
+
+
+void test_bed_entry_get()
+{
+ fprintf( stderr, " Testing bed_entry_get ... " );
+
+ char *path = "test/test_files/test12.bed";
+ FILE *fp = NULL;
+ bed_entry *entry = NULL;
+
+ fp = read_open( path );
+
+ while ( ( entry = bed_entry_get( fp, 12 ) ) != NULL )
+ {
+// bed_entry_put( entry, 3 );
+ }
+
+ close_stream( fp );
+
+ fprintf( stderr, "OK\n" );
+}
+
+
+void test_bed_entries_get()
+{
+ fprintf( stderr, " Testing bed_entries_get ... " );
+
+ char *path = "test/test_files/test12.bed";
+ list_sl *entries = NULL;
+
+ entries = bed_entries_get( path, 0 );
+
+ bed_entries_put( entries, 0 );
+
+ fprintf( stderr, "OK\n" );
+}
+
+
+void test_bed_entries_sort()
+{
+ fprintf( stderr, " Testing bed_entries_sort ... " );
+
+ char *path = "test/test_files/test12.bed";
+ list_sl *entries = NULL;
+
+ entries = bed_entries_get( path, 0 );
+
+ list_sl_sort( &entries, cmp_bed3_entries_sort );
+
+ bed_entries_put( entries, 0 );
+
+ fprintf( stderr, "OK\n" );
+}
+