From: martinahansen Date: Thu, 21 Aug 2008 07:50:28 +0000 (+0000) Subject: finished c fasta parser X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=688d36b699fba5fef0aff270babd4f72e0303a38;p=biopieces.git finished c fasta parser git-svn-id: http://biopieces.googlecode.com/svn/trunk@216 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_c/Maasha/src/gmon.out b/code_c/Maasha/src/gmon.out index 330a6bc..7cfc958 100644 Binary files a/code_c/Maasha/src/gmon.out and b/code_c/Maasha/src/gmon.out differ diff --git a/code_c/Maasha/src/inc/fasta.h b/code_c/Maasha/src/inc/fasta.h index 6d7c5c4..98f9272 100644 --- a/code_c/Maasha/src/inc/fasta.h +++ b/code_c/Maasha/src/inc/fasta.h @@ -1,26 +1,10 @@ -#define MAX_SEQ_NAME 1024 -#define FASTA_BUFFER 256 * 1024 - -#define isseq( x ) ( x > 32 && x < 127 ) ? TRUE : FALSE - -/* Structure of a sequence entry. */ -struct _seq_entry -{ - char *seq_name; - char *seq; - size_t seq_len; -}; - -typedef struct _seq_entry seq_entry; +#define FASTA_BUFFER 1024 /* Count all entries in a FASTA file given a file pointer. */ size_t fasta_count( FILE *fp ); -/* Get the next FASTA sequence name from a stream. */ -bool fasta_get_seq_name( FILE *fp, char **seq_name_ppt ); - /* Get next sequence entry from a FASTA file given a file pointer. */ -bool fasta_get_entry( file_buffer **buffer_ppt, seq_entry **entry ); +bool fasta_get_entry( FILE *fp, seq_entry **entry_ppt ); /* Output a sequence entry in FASTA format. */ void fasta_put_entry( seq_entry *entry ); @@ -30,6 +14,3 @@ void fasta_get_entries( FILE *fp, struct list **entries ); /* Output all sequence entries from a list in FASTA format. */ void fasta_put_entries( struct list *entries ); - -/* Deallocates memory from a seq_entry. */ -void fasta_free_entry( seq_entry *entry ); diff --git a/code_c/Maasha/src/inc/seq.h b/code_c/Maasha/src/inc/seq.h index ef7e032..43b6563 100644 --- a/code_c/Maasha/src/inc/seq.h +++ b/code_c/Maasha/src/inc/seq.h @@ -1,8 +1,30 @@ +/* Macro to test if a given char is sequence (DNA, RNA, Protein, indels. brackets, etc ). */ +#define isseq( x ) ( x > 32 && x < 127 ) ? 1 : 0 + /* Macro to test if a given char is DNA. */ -#define dna_clean( c ) ( c == 'A' || c == 'a' || c == 'T' || c == 't' || c == 'C' || c == 'c' || c == 'G' || c == 'g' || c == 'N' || c == 'n' ) ? 1 : 0 +#define isDNA( c ) ( c == 'A' || c == 'a' || c == 'T' || c == 't' || c == 'C' || c == 'c' || c == 'G' || c == 'g' || c == 'N' || c == 'n' ) ? 1 : 0 /* Macro to test if a given char is RNA. */ -#define rna_clean( c ) ( c == 'A' || c == 'a' || c == 'U' || c == 'u' || c == 'C' || c == 'c' || c == 'G' || c == 'g' || c == 'N' || c == 'n' ) ? 1 : 0 +#define isRNA( c ) ( c == 'A' || c == 'a' || c == 'U' || c == 'u' || c == 'C' || c == 'c' || c == 'G' || c == 'g' || c == 'N' || c == 'n' ) ? 1 : 0 + +#define MAX_SEQ_NAME 1024 +#define MAX_SEQ 250000000 + +/* Definition of a sequence entry */ +struct _seq_entry +{ + char *seq_name; + char *seq; + size_t seq_len; +}; + +typedef struct _seq_entry seq_entry; + +/* Initialize a new sequence entry. */ +void seq_new( seq_entry **entry_ppt, size_t max_seq_name, size_t max_seq ); + +/* Destroy a sequence entry. */ +void seq_destroy( seq_entry *entry ); /* Uppercase sequence. */ void uppercase_seq( char *seq ); @@ -11,43 +33,43 @@ void uppercase_seq( char *seq ); void lowercase_seq( char *seq ); /* Reverse compliments DNA sequence. */ -void revcomp_dna( char *seq ); +void revcomp_dna( char *seq ); /* Reverse compliments RNA sequence. */ -void revcomp_rna( char *seq ); +void revcomp_rna( char *seq ); /* Reverse compliment nucleotide sequnce after guessing the sequence type. */ -void revcomp_nuc( char *seq ); +void revcomp_nuc( char *seq ); /* Complement DNA sequence. (NB it is not reversed!). */ -void complement_dna( char *seq ); +void complement_dna( char *seq ); /* Complement RNA sequence. (NB it is not reversed!). */ -void complement_rna( char *seq ); +void complement_rna( char *seq ); /* Complement nucleotide sequence after guessing the sequence type. */ -void complement_nuc( char *seq ); +void complement_nuc( char *seq ); /* Reverse sequence. */ -void reverse( char *seq ); +void reverse( char *seq ); /* Convert all non-nucleotide letters to Ns. */ -void seq2nuc_simple( char *seq ); +void seq2nuc_simple( char *seq ); /* Convert DNA into RNA by change t and T to u and U, respectively. */ -void dna2rna( char *seq ); +void dna2rna( char *seq ); /* Convert RNA into DNA by change u and U to t and T, respectively. */ -void rna2dna( char *seq ); +void rna2dna( char *seq ); /* Check if a sequence is DNA by inspecting the first 100 residues. */ -bool is_dna( char *seq ); +bool is_dna( char *seq ); /* Check if a sequence is RNA by inspecting the first 100 residues. */ -bool is_rna( char *seq ); +bool is_rna( char *seq ); /* Check if a sequence is protein by inspecting the first 100 residues. */ -bool is_protein( char *seq ); +bool is_protein( char *seq ); /* Guess if a sequence is DNA, RNA, or protein by inspecting the first 100 residues. */ char *seq_guess_type( char *seq ); @@ -56,4 +78,5 @@ char *seq_guess_type( char *seq ); bool contain_N( char *seq ); /* Pack a nucleotide oligo (max length 15) into a binary/integer (good for hash keys). */ -int oligo2bin( char *oligo ); +int oligo2bin( char *oligo ); + diff --git a/code_c/Maasha/src/lib/fasta.c b/code_c/Maasha/src/lib/fasta.c index 493cbe7..3f24693 100644 --- a/code_c/Maasha/src/lib/fasta.c +++ b/code_c/Maasha/src/lib/fasta.c @@ -1,8 +1,10 @@ #include "common.h" #include "mem.h" #include "filesys.h" +#include "seq.h" #include "fasta.h" #include "list.h" +#include "strings.h" size_t fasta_count( FILE *fp ) @@ -27,269 +29,57 @@ size_t fasta_count( FILE *fp ) } -bool fasta_get_entry( file_buffer **buffer_ppt, seq_entry **entry_ppt ) +bool fasta_get_entry( FILE *fp, seq_entry **entry_ppt ) { /* Martin A. Hansen, August 2008 */ - /* Get next sequence entry from a FASTA file given a file buffer. */ + /* Get next sequence entry from a FASTA file given a file pointer. */ - file_buffer *buffer = *buffer_ppt; - char *line = NULL; - size_t seq_name_len = 0; - size_t seq_len = 0; - char *seq_name = NULL; - char *seq = NULL; - size_t i = 0; + seq_entry *entry = *entry_ppt; - while ( 1 ) - { - if ( ( line = buffer_gets( buffer ) ) != NULL ) - { - if ( line[ 0 ] == '>' ) - { - seq_name_len = buffer->token_len - 2; - seq_name = mem_get( seq_name_len + 1 ); - - memcpy( seq_name, &line[ 1 ], seq_name_len ); - - break; - } - } - else - { - return FALSE; - } - } - -// printf( "SEQ_NAME: ->%s<-\n", seq_name ); - - seq = mem_get( 1 ); + entry->seq_len = 0; - seq_len = 0; + char c; + char *pt; - while ( 1 ) - { - if ( ( line = buffer_gets( buffer ) ) != NULL ) - { - if ( line[ 0 ] == '>' ) - { - buffer_ungets( buffer ); - - break; - } - else - { -// mem_resize( seq, seq_len + buffer->token_len ); - - for ( i = 0; line[ i ]; i++ ) - { - if ( isseq( line[ i ] ) ) - { - seq[ seq_len ] = line[ i ]; - - seq_len++; - } - } - } - } - else - { - break; - } + while ( ( c = fgetc( fp ) ) && c != '>' && c != EOF ) { } - if ( seq_len == 0 ) - { + if ( ( ferror( fp ) != 0 ) ) { + fprintf( stderr, "ERROR: Could not read from file: %s\n", strerror( errno ) ); + } else if ( ( feof( fp ) != 0 ) ) { return FALSE; } -// seq = mem_resize( seq, seq_len ); - - seq[ seq_len ] = '\0'; - -// printf( "SEQ: ->%s<-\n", seq ); - - ( *entry_ppt )->seq_name = seq_name; - ( *entry_ppt )->seq = seq; - ( *entry_ppt )->seq_len = seq_len; - *buffer_ppt = buffer; + pt = entry->seq_name; - return TRUE; -} - - -bool fasta_get_seq_name( FILE *fp, char **seq_name_ppt ) -{ - /* Martin A. Hansen, August 2008 */ + while ( ( c = fgetc( fp ) ) && c != '\n' ) { + *( pt++ ) = c; + } - /* Get the next FASTA sequence name from a stream. */ + *( pt ) = '\0'; - char *pt; - char buffer[ MAX_SEQ_NAME ]; + pt = entry->seq; - while ( 1 ) + while ( ( c = fgetc( fp ) ) && c != '>' && c != EOF ) { - if ( ( pt = fgets( buffer, MAX_SEQ_NAME, fp ) ) ) + if ( isseq( c ) ) { - printf( "buffer: %s\n", buffer ); - - if ( buffer[ 0 ] == '>' ) - { - *seq_name_ppt = &buffer[ 1 ]; - - return TRUE; - } + *( pt++ ) = c; + entry->seq_len++; } } - return FALSE; -} + *( pt ) = '\0'; + if ( c == '>' ) { + ungetc( c, fp ); + } -//bool fasta_get_entry( FILE *fp, seq_entry **entry ) -//{ -// /* Martin A. Hansen, May 2008 */ -// -// /* Unit test done.*/ -// -// /* Get next sequence entry from a FASTA file given a file pointer. */ -// -// size_t i; -// size_t offset; -// size_t seq_buffer_len; -// size_t seq_len; -// size_t buffer_read; -// char buffer[ FASTA_BUFFER ]; -// size_t buffer_len; -// char *seq_name = NULL; -// char *seq = NULL; -// -// offset = ftell( fp ); -// -// /* ---- Skip ahead until header line and include header ---- */ -// -// while ( 1 ) -// { -// if ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) -// { -// buffer_len = strlen( buffer ); -// -// offset += buffer_len; -// -// if ( ( buffer[ 0 ] == '>' ) ) -// { -// seq_name = mem_get( buffer_len - 1 ); -// -// memcpy( seq_name, &buffer[ 1 ], buffer_len - 2 ); -// -// seq_name[ buffer_len - 2 ] = '\0'; -// -// break; -// } -// } -// else -// { -// if ( ferror( fp ) != 0 ) -// { -// fprintf( stderr, "ERROR: get_fasta_seq failed: %s\n", strerror( errno ) ); -// abort(); -// } -// else if ( feof( fp ) != 0 ) -// { -// return FALSE; -// } -// } -// } -// -// /* ---- Determine approximate length of sequence ---- */ -// -// seq_buffer_len = 0; -// -// while ( 1 ) -// { -// if ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) -// { -// if ( ( buffer[ 0 ] == '>' ) ) -// { -// assert( seq_buffer_len != 0 ); -// -// break; -// } -// else -// { -// seq_buffer_len += strlen( buffer ); -// } -// } -// else -// { -// if ( ferror( fp ) != 0 ) -// { -// fprintf( stderr, "ERROR: get_fasta_seq failed: %s\n", strerror( errno ) ); -// abort(); -// } -// else if ( feof( fp ) != 0 ) -// { -// break; -// } -// } -// } -// -// /* ---- Allocate approximate memory for sequence ---- */ -// -// seq = mem_get( seq_buffer_len + 1 ); -// -// /* ---- Rewind file pointer and read sequence ---- */ -// -// if ( fseek( fp, offset, SEEK_SET ) != 0 ) -// { -// fprintf( stderr, "ERROR: fseek SEEK_SET failed: %s\n", strerror( errno ) ); -// abort(); -// } -// -// buffer_read = 0; -// seq_len = 0; -// -// while ( buffer_read < seq_buffer_len ) -// { -// if ( fgets( buffer, sizeof( buffer ), fp ) != NULL ) -// { -// for ( i = 0; buffer[ i ]; i++ ) -// { -// if ( buffer[ i ] > 32 && buffer[ i ] < 127 ) -// { -// seq[ seq_len ] = buffer[ i ]; -// -// seq_len++; -// } -// } -// -// buffer_read += i; -// } -// else -// { -// if ( ferror( fp ) != 0 ) -// { -// fprintf( stderr, "ERROR: get_fasta_seq failed: %s\n", strerror( errno ) ); -// abort(); -// } -// else if ( feof( fp ) != 0 ) -// { -// fprintf( stderr, "ERROR: get_fasta_seq failed: EOF\n" ); -// abort(); -// } -// } -// } -// -//// seq = mem_resize( seq, seq_len + 1 ); -// -// seq[ seq_len + 1 ] = '\0'; -// -// ( *entry )->seq_name = seq_name; -// ( *entry )->seq = seq; -// ( *entry )->seq_len = seq_len; -// -// return TRUE; -//} + *entry_ppt = entry; + + return TRUE; +} void fasta_put_entry( seq_entry *entry ) @@ -337,17 +127,3 @@ void fasta_put_entries( struct list *entries ) fasta_put_entry( elem->val ); } } - - -void fasta_free_entry( seq_entry *entry ) -{ - /* Martin A. Hansen, June 2008 */ - - /* Deallocates memory from a seq_entry. */ - - mem_free( ( void * ) &entry->seq_name ); - mem_free( ( void * ) &entry->seq ); - mem_free( ( void * ) &entry ); -} - - diff --git a/code_c/Maasha/src/lib/mem.c b/code_c/Maasha/src/lib/mem.c index 5bee88b..03f49d5 100644 --- a/code_c/Maasha/src/lib/mem.c +++ b/code_c/Maasha/src/lib/mem.c @@ -118,3 +118,5 @@ void mem_free( void **ppt ) *ppt = NULL; } + + diff --git a/code_c/Maasha/src/lib/seq.c b/code_c/Maasha/src/lib/seq.c index db88277..1491b2a 100644 --- a/code_c/Maasha/src/lib/seq.c +++ b/code_c/Maasha/src/lib/seq.c @@ -3,6 +3,35 @@ #include "seq.h" +void seq_new( seq_entry **entry_ppt, size_t max_seq_name, size_t max_seq ) +{ + /* Martin A. Hansen, August 2008 */ + + /* Initialize a new sequence entry. */ + + seq_entry *entry = *entry_ppt; + entry = mem_get( sizeof( seq_entry ) ); + entry->seq_name = mem_get( max_seq_name ); + entry->seq = mem_get( max_seq ); + entry->seq_len = 0; + *entry_ppt = entry; +} + + +void seq_destroy( seq_entry *entry ) +{ + /* Martin A. Hansen, August 2008 */ + + /* Destroy a sequence entry. */ + + free( entry->seq_name ); + free( entry->seq ); + free( entry ); + + entry = NULL; +} + + void uppercase_seq( char *seq ) { /* Martin A. Hansen, May 2008 */ diff --git a/code_c/Maasha/src/test/Makefile b/code_c/Maasha/src/test/Makefile index 43ddd58..38dbcb1 100644 --- a/code_c/Maasha/src/test/Makefile +++ b/code_c/Maasha/src/test/Makefile @@ -9,7 +9,7 @@ LIB = -lm $(LIB_DIR)*.o all: test -test: test_common test_fasta test_filesys test_mem test_strings +test: test_common test_fasta test_filesys test_mem test_seq test_strings test_common: test_common.c $(LIB_DIR)common.c $(CC) $(Cflags) $(INC) $(LIB) test_common.c -o test_common @@ -23,6 +23,9 @@ test_filesys: test_filesys.c $(LIB_DIR)filesys.c test_mem: test_mem.c $(LIB_DIR)mem.c $(CC) $(Cflags) $(INC) $(LIB) test_mem.c -o test_mem +test_seq: test_seq.c $(LIB_DIR)seq.c + $(CC) $(Cflags) $(INC) $(LIB) test_seq.c -o test_seq + test_strings: test_strings.c $(LIB_DIR)strings.c $(CC) $(Cflags) $(INC) $(LIB) test_strings.c -o test_strings @@ -31,4 +34,5 @@ clean: rm test_fasta rm test_filesys rm test_mem + rm test_seq rm test_strings diff --git a/code_c/Maasha/src/test/test_fasta.c b/code_c/Maasha/src/test/test_fasta.c index b9641a0..25e193e 100644 --- a/code_c/Maasha/src/test/test_fasta.c +++ b/code_c/Maasha/src/test/test_fasta.c @@ -1,6 +1,7 @@ #include "common.h" #include "filesys.h" #include "mem.h" +#include "seq.h" #include "fasta.h" #define TEST_FILE1 "test/test_files/test.fna" @@ -8,18 +9,16 @@ #define TEST_COUNT 10 #define TEST_SIZE 10 -//static void test_fasta_get_entry(); -static void test_fasta_get_seq_name(); -//static void test_fasta_put_entry(); +static void test_fasta_get_entry(); +static void test_fasta_put_entry(); int main() { fprintf( stderr, "Running all tests for fasta.c\n" ); -// test_fasta_get_entry(); - test_fasta_get_seq_name(); -// test_fasta_put_entry(); + test_fasta_get_entry(); + test_fasta_put_entry(); fprintf( stderr, "Done\n\n" ); @@ -27,73 +26,53 @@ int main() } -//void test_fasta_get_entry() -//{ -// fprintf( stderr, " Testing fasta_get_entry ... " ); -// -// file_buffer *buffer = NULL; -// seq_entry *entry = NULL; -// -// buffer_new( TEST_FILE1, &buffer, TEST_SIZE ); -// -// entry = mem_get( sizeof( seq_entry ) ); -// -// -// while ( fasta_get_entry( &buffer, &entry ) != FALSE ) -// { -// fasta_put_entry( entry ); -//// assert( strlen( entry->seq ) == entry->seq_len ); -// -//// printf( "%s\t%zu\n", entry->seq_name, entry->seq_len ); -// -// } -// -// buffer_destroy( &buffer ); -// -// buffer = NULL; -// -// fprintf( stderr, "OK\n" ); -//} -// - - -void test_fasta_get_seq_name() +void test_fasta_get_entry() { - fprintf( stderr, " Testing fasta_get_seq_name ... " ); + fprintf( stderr, " Testing fasta_get_entry ... " ); - FILE *fp = NULL; - char *seq_name = NULL; + FILE *fp = NULL; + seq_entry *entry = NULL; + size_t max_seq_name = MAX_SEQ_NAME; + size_t max_seq = MAX_SEQ; - fp = read_open( TEST_FILE1 ); + fp = read_open( TEST_FILE2 ); + + seq_new( &entry, max_seq_name, max_seq ); - fasta_get_seq_name( fp, &seq_name ); + while ( ( fasta_get_entry( fp, &entry ) != FALSE ) ) { + printf( "seq_name: %s seq_len: %zu\n", entry->seq_name, entry->seq_len ); + } - printf( "SEQ_NAME: ->%s<-\n", seq_name ); + seq_destroy( entry ); - assert( strcmp( seq_name, "test0" ) == 0 ); + close_stream( fp ); fprintf( stderr, "OK\n" ); } -//void test_fasta_put_entry() -//{ -// fprintf( stderr, " Testing fasta_put_entry ... " ); -// -// seq_entry *entry = NULL; -// char *seq_name = "test"; -// char *seq = "ATCG"; -// size_t seq_len = strlen( seq ); -// -// entry = mem_get( sizeof( seq_entry ) ); -// -// entry->seq_name = seq_name; -// entry->seq = seq; -// entry->seq_len = seq_len; -// -// fasta_put_entry( entry ); -// -// fprintf( stderr, "OK\n" ); -//} +void test_fasta_put_entry() +{ + fprintf( stderr, " Testing fasta_put_entry ... " ); + + FILE *fp = NULL; + seq_entry *entry = NULL; + size_t max_seq_name = MAX_SEQ_NAME; + size_t max_seq = MAX_SEQ; + + fp = read_open( TEST_FILE1 ); + + seq_new( &entry, max_seq_name, max_seq ); + + while ( ( fasta_get_entry( fp, &entry ) != FALSE ) ) { + fasta_put_entry( entry ); + } + + seq_destroy( entry ); + + close_stream( fp ); + + fprintf( stderr, "OK\n" ); +}