-#define MAX_SEQ_NAME 1024
-#define FASTA_BUFFER 256 * 1024
-
-#define isseq( x ) ( x > 32 && x < 127 ) ? TRUE : FALSE
-
-/* Structure of a sequence entry. */
-struct _seq_entry
-{
- char *seq_name;
- char *seq;
- size_t seq_len;
-};
-
-typedef struct _seq_entry seq_entry;
+#define FASTA_BUFFER 1024
/* Count all entries in a FASTA file given a file pointer. */
size_t fasta_count( FILE *fp );
-/* Get the next FASTA sequence name from a stream. */
-bool fasta_get_seq_name( FILE *fp, char **seq_name_ppt );
-
/* Get next sequence entry from a FASTA file given a file pointer. */
-bool fasta_get_entry( file_buffer **buffer_ppt, seq_entry **entry );
+bool fasta_get_entry( FILE *fp, seq_entry **entry_ppt );
/* Output a sequence entry in FASTA format. */
void fasta_put_entry( seq_entry *entry );
/* Output all sequence entries from a list in FASTA format. */
void fasta_put_entries( struct list *entries );
-
-/* Deallocates memory from a seq_entry. */
-void fasta_free_entry( seq_entry *entry );
+/* Macro to test if a given char is sequence (DNA, RNA, Protein, indels. brackets, etc ). */
+#define isseq( x ) ( x > 32 && x < 127 ) ? 1 : 0
+
/* Macro to test if a given char is DNA. */
-#define dna_clean( c ) ( c == 'A' || c == 'a' || c == 'T' || c == 't' || c == 'C' || c == 'c' || c == 'G' || c == 'g' || c == 'N' || c == 'n' ) ? 1 : 0
+#define isDNA( c ) ( c == 'A' || c == 'a' || c == 'T' || c == 't' || c == 'C' || c == 'c' || c == 'G' || c == 'g' || c == 'N' || c == 'n' ) ? 1 : 0
/* Macro to test if a given char is RNA. */
-#define rna_clean( c ) ( c == 'A' || c == 'a' || c == 'U' || c == 'u' || c == 'C' || c == 'c' || c == 'G' || c == 'g' || c == 'N' || c == 'n' ) ? 1 : 0
+#define isRNA( c ) ( c == 'A' || c == 'a' || c == 'U' || c == 'u' || c == 'C' || c == 'c' || c == 'G' || c == 'g' || c == 'N' || c == 'n' ) ? 1 : 0
+
+#define MAX_SEQ_NAME 1024
+#define MAX_SEQ 250000000
+
+/* Definition of a sequence entry */
+struct _seq_entry
+{
+ char *seq_name;
+ char *seq;
+ size_t seq_len;
+};
+
+typedef struct _seq_entry seq_entry;
+
+/* Initialize a new sequence entry. */
+void seq_new( seq_entry **entry_ppt, size_t max_seq_name, size_t max_seq );
+
+/* Destroy a sequence entry. */
+void seq_destroy( seq_entry *entry );
/* Uppercase sequence. */
void uppercase_seq( char *seq );
void lowercase_seq( char *seq );
/* Reverse compliments DNA sequence. */
-void revcomp_dna( char *seq );
+void revcomp_dna( char *seq );
/* Reverse compliments RNA sequence. */
-void revcomp_rna( char *seq );
+void revcomp_rna( char *seq );
/* Reverse compliment nucleotide sequnce after guessing the sequence type. */
-void revcomp_nuc( char *seq );
+void revcomp_nuc( char *seq );
/* Complement DNA sequence. (NB it is not reversed!). */
-void complement_dna( char *seq );
+void complement_dna( char *seq );
/* Complement RNA sequence. (NB it is not reversed!). */
-void complement_rna( char *seq );
+void complement_rna( char *seq );
/* Complement nucleotide sequence after guessing the sequence type. */
-void complement_nuc( char *seq );
+void complement_nuc( char *seq );
/* Reverse sequence. */
-void reverse( char *seq );
+void reverse( char *seq );
/* Convert all non-nucleotide letters to Ns. */
-void seq2nuc_simple( char *seq );
+void seq2nuc_simple( char *seq );
/* Convert DNA into RNA by change t and T to u and U, respectively. */
-void dna2rna( char *seq );
+void dna2rna( char *seq );
/* Convert RNA into DNA by change u and U to t and T, respectively. */
-void rna2dna( char *seq );
+void rna2dna( char *seq );
/* Check if a sequence is DNA by inspecting the first 100 residues. */
-bool is_dna( char *seq );
+bool is_dna( char *seq );
/* Check if a sequence is RNA by inspecting the first 100 residues. */
-bool is_rna( char *seq );
+bool is_rna( char *seq );
/* Check if a sequence is protein by inspecting the first 100 residues. */
-bool is_protein( char *seq );
+bool is_protein( char *seq );
/* Guess if a sequence is DNA, RNA, or protein by inspecting the first 100 residues. */
char *seq_guess_type( char *seq );
bool contain_N( char *seq );
/* Pack a nucleotide oligo (max length 15) into a binary/integer (good for hash keys). */
-int oligo2bin( char *oligo );
+int oligo2bin( char *oligo );
+
#include "common.h"
#include "mem.h"
#include "filesys.h"
+#include "seq.h"
#include "fasta.h"
#include "list.h"
+#include "strings.h"
size_t fasta_count( FILE *fp )
}
-bool fasta_get_entry( file_buffer **buffer_ppt, seq_entry **entry_ppt )
+bool fasta_get_entry( FILE *fp, seq_entry **entry_ppt )
{
/* Martin A. Hansen, August 2008 */
- /* Get next sequence entry from a FASTA file given a file buffer. */
+ /* Get next sequence entry from a FASTA file given a file pointer. */
- file_buffer *buffer = *buffer_ppt;
- char *line = NULL;
- size_t seq_name_len = 0;
- size_t seq_len = 0;
- char *seq_name = NULL;
- char *seq = NULL;
- size_t i = 0;
+ seq_entry *entry = *entry_ppt;
- while ( 1 )
- {
- if ( ( line = buffer_gets( buffer ) ) != NULL )
- {
- if ( line[ 0 ] == '>' )
- {
- seq_name_len = buffer->token_len - 2;
- seq_name = mem_get( seq_name_len + 1 );
-
- memcpy( seq_name, &line[ 1 ], seq_name_len );
-
- break;
- }
- }
- else
- {
- return FALSE;
- }
- }
-
-// printf( "SEQ_NAME: ->%s<-\n", seq_name );
-
- seq = mem_get( 1 );
+ entry->seq_len = 0;
- seq_len = 0;
+ char c;
+ char *pt;
- while ( 1 )
- {
- if ( ( line = buffer_gets( buffer ) ) != NULL )
- {
- if ( line[ 0 ] == '>' )
- {
- buffer_ungets( buffer );
-
- break;
- }
- else
- {
-// mem_resize( seq, seq_len + buffer->token_len );
-
- for ( i = 0; line[ i ]; i++ )
- {
- if ( isseq( line[ i ] ) )
- {
- seq[ seq_len ] = line[ i ];
-
- seq_len++;
- }
- }
- }
- }
- else
- {
- break;
- }
+ while ( ( c = fgetc( fp ) ) && c != '>' && c != EOF ) {
}
- if ( seq_len == 0 )
- {
+ if ( ( ferror( fp ) != 0 ) ) {
+ fprintf( stderr, "ERROR: Could not read from file: %s\n", strerror( errno ) );
+ } else if ( ( feof( fp ) != 0 ) ) {
return FALSE;
}
-// seq = mem_resize( seq, seq_len );
-
- seq[ seq_len ] = '\0';
-
-// printf( "SEQ: ->%s<-\n", seq );
-
- ( *entry_ppt )->seq_name = seq_name;
- ( *entry_ppt )->seq = seq;
- ( *entry_ppt )->seq_len = seq_len;
- *buffer_ppt = buffer;
+ pt = entry->seq_name;
- return TRUE;
-}
-
-
-bool fasta_get_seq_name( FILE *fp, char **seq_name_ppt )
-{
- /* Martin A. Hansen, August 2008 */
+ while ( ( c = fgetc( fp ) ) && c != '\n' ) {
+ *( pt++ ) = c;
+ }
- /* Get the next FASTA sequence name from a stream. */
+ *( pt ) = '\0';
- char *pt;
- char buffer[ MAX_SEQ_NAME ];
+ pt = entry->seq;
- while ( 1 )
+ while ( ( c = fgetc( fp ) ) && c != '>' && c != EOF )
{
- if ( ( pt = fgets( buffer, MAX_SEQ_NAME, fp ) ) )
+ if ( isseq( c ) )
{
- printf( "buffer: %s\n", buffer );
-
- if ( buffer[ 0 ] == '>' )
- {
- *seq_name_ppt = &buffer[ 1 ];
-
- return TRUE;
- }
+ *( pt++ ) = c;
+ entry->seq_len++;
}
}
- return FALSE;
-}
+ *( pt ) = '\0';
+ if ( c == '>' ) {
+ ungetc( c, fp );
+ }
-//bool fasta_get_entry( FILE *fp, seq_entry **entry )
-//{
-// /* Martin A. Hansen, May 2008 */
-//
-// /* Unit test done.*/
-//
-// /* Get next sequence entry from a FASTA file given a file pointer. */
-//
-// size_t i;
-// size_t offset;
-// size_t seq_buffer_len;
-// size_t seq_len;
-// size_t buffer_read;
-// char buffer[ FASTA_BUFFER ];
-// size_t buffer_len;
-// char *seq_name = NULL;
-// char *seq = NULL;
-//
-// offset = ftell( fp );
-//
-// /* ---- Skip ahead until header line and include header ---- */
-//
-// while ( 1 )
-// {
-// if ( fgets( buffer, sizeof( buffer ), fp ) != NULL )
-// {
-// buffer_len = strlen( buffer );
-//
-// offset += buffer_len;
-//
-// if ( ( buffer[ 0 ] == '>' ) )
-// {
-// seq_name = mem_get( buffer_len - 1 );
-//
-// memcpy( seq_name, &buffer[ 1 ], buffer_len - 2 );
-//
-// seq_name[ buffer_len - 2 ] = '\0';
-//
-// break;
-// }
-// }
-// else
-// {
-// if ( ferror( fp ) != 0 )
-// {
-// fprintf( stderr, "ERROR: get_fasta_seq failed: %s\n", strerror( errno ) );
-// abort();
-// }
-// else if ( feof( fp ) != 0 )
-// {
-// return FALSE;
-// }
-// }
-// }
-//
-// /* ---- Determine approximate length of sequence ---- */
-//
-// seq_buffer_len = 0;
-//
-// while ( 1 )
-// {
-// if ( fgets( buffer, sizeof( buffer ), fp ) != NULL )
-// {
-// if ( ( buffer[ 0 ] == '>' ) )
-// {
-// assert( seq_buffer_len != 0 );
-//
-// break;
-// }
-// else
-// {
-// seq_buffer_len += strlen( buffer );
-// }
-// }
-// else
-// {
-// if ( ferror( fp ) != 0 )
-// {
-// fprintf( stderr, "ERROR: get_fasta_seq failed: %s\n", strerror( errno ) );
-// abort();
-// }
-// else if ( feof( fp ) != 0 )
-// {
-// break;
-// }
-// }
-// }
-//
-// /* ---- Allocate approximate memory for sequence ---- */
-//
-// seq = mem_get( seq_buffer_len + 1 );
-//
-// /* ---- Rewind file pointer and read sequence ---- */
-//
-// if ( fseek( fp, offset, SEEK_SET ) != 0 )
-// {
-// fprintf( stderr, "ERROR: fseek SEEK_SET failed: %s\n", strerror( errno ) );
-// abort();
-// }
-//
-// buffer_read = 0;
-// seq_len = 0;
-//
-// while ( buffer_read < seq_buffer_len )
-// {
-// if ( fgets( buffer, sizeof( buffer ), fp ) != NULL )
-// {
-// for ( i = 0; buffer[ i ]; i++ )
-// {
-// if ( buffer[ i ] > 32 && buffer[ i ] < 127 )
-// {
-// seq[ seq_len ] = buffer[ i ];
-//
-// seq_len++;
-// }
-// }
-//
-// buffer_read += i;
-// }
-// else
-// {
-// if ( ferror( fp ) != 0 )
-// {
-// fprintf( stderr, "ERROR: get_fasta_seq failed: %s\n", strerror( errno ) );
-// abort();
-// }
-// else if ( feof( fp ) != 0 )
-// {
-// fprintf( stderr, "ERROR: get_fasta_seq failed: EOF\n" );
-// abort();
-// }
-// }
-// }
-//
-//// seq = mem_resize( seq, seq_len + 1 );
-//
-// seq[ seq_len + 1 ] = '\0';
-//
-// ( *entry )->seq_name = seq_name;
-// ( *entry )->seq = seq;
-// ( *entry )->seq_len = seq_len;
-//
-// return TRUE;
-//}
+ *entry_ppt = entry;
+
+ return TRUE;
+}
void fasta_put_entry( seq_entry *entry )
fasta_put_entry( elem->val );
}
}
-
-
-void fasta_free_entry( seq_entry *entry )
-{
- /* Martin A. Hansen, June 2008 */
-
- /* Deallocates memory from a seq_entry. */
-
- mem_free( ( void * ) &entry->seq_name );
- mem_free( ( void * ) &entry->seq );
- mem_free( ( void * ) &entry );
-}
-
-
#include "seq.h"
+void seq_new( seq_entry **entry_ppt, size_t max_seq_name, size_t max_seq )
+{
+ /* Martin A. Hansen, August 2008 */
+
+ /* Initialize a new sequence entry. */
+
+ seq_entry *entry = *entry_ppt;
+ entry = mem_get( sizeof( seq_entry ) );
+ entry->seq_name = mem_get( max_seq_name );
+ entry->seq = mem_get( max_seq );
+ entry->seq_len = 0;
+ *entry_ppt = entry;
+}
+
+
+void seq_destroy( seq_entry *entry )
+{
+ /* Martin A. Hansen, August 2008 */
+
+ /* Destroy a sequence entry. */
+
+ free( entry->seq_name );
+ free( entry->seq );
+ free( entry );
+
+ entry = NULL;
+}
+
+
void uppercase_seq( char *seq )
{
/* Martin A. Hansen, May 2008 */
all: test
-test: test_common test_fasta test_filesys test_mem test_strings
+test: test_common test_fasta test_filesys test_mem test_seq test_strings
test_common: test_common.c $(LIB_DIR)common.c
$(CC) $(Cflags) $(INC) $(LIB) test_common.c -o test_common
test_mem: test_mem.c $(LIB_DIR)mem.c
$(CC) $(Cflags) $(INC) $(LIB) test_mem.c -o test_mem
+test_seq: test_seq.c $(LIB_DIR)seq.c
+ $(CC) $(Cflags) $(INC) $(LIB) test_seq.c -o test_seq
+
test_strings: test_strings.c $(LIB_DIR)strings.c
$(CC) $(Cflags) $(INC) $(LIB) test_strings.c -o test_strings
rm test_fasta
rm test_filesys
rm test_mem
+ rm test_seq
rm test_strings
#include "common.h"
#include "filesys.h"
#include "mem.h"
+#include "seq.h"
#include "fasta.h"
#define TEST_FILE1 "test/test_files/test.fna"
#define TEST_COUNT 10
#define TEST_SIZE 10
-//static void test_fasta_get_entry();
-static void test_fasta_get_seq_name();
-//static void test_fasta_put_entry();
+static void test_fasta_get_entry();
+static void test_fasta_put_entry();
int main()
{
fprintf( stderr, "Running all tests for fasta.c\n" );
-// test_fasta_get_entry();
- test_fasta_get_seq_name();
-// test_fasta_put_entry();
+ test_fasta_get_entry();
+ test_fasta_put_entry();
fprintf( stderr, "Done\n\n" );
}
-//void test_fasta_get_entry()
-//{
-// fprintf( stderr, " Testing fasta_get_entry ... " );
-//
-// file_buffer *buffer = NULL;
-// seq_entry *entry = NULL;
-//
-// buffer_new( TEST_FILE1, &buffer, TEST_SIZE );
-//
-// entry = mem_get( sizeof( seq_entry ) );
-//
-//
-// while ( fasta_get_entry( &buffer, &entry ) != FALSE )
-// {
-// fasta_put_entry( entry );
-//// assert( strlen( entry->seq ) == entry->seq_len );
-//
-//// printf( "%s\t%zu\n", entry->seq_name, entry->seq_len );
-//
-// }
-//
-// buffer_destroy( &buffer );
-//
-// buffer = NULL;
-//
-// fprintf( stderr, "OK\n" );
-//}
-//
-
-
-void test_fasta_get_seq_name()
+void test_fasta_get_entry()
{
- fprintf( stderr, " Testing fasta_get_seq_name ... " );
+ fprintf( stderr, " Testing fasta_get_entry ... " );
- FILE *fp = NULL;
- char *seq_name = NULL;
+ FILE *fp = NULL;
+ seq_entry *entry = NULL;
+ size_t max_seq_name = MAX_SEQ_NAME;
+ size_t max_seq = MAX_SEQ;
- fp = read_open( TEST_FILE1 );
+ fp = read_open( TEST_FILE2 );
+
+ seq_new( &entry, max_seq_name, max_seq );
- fasta_get_seq_name( fp, &seq_name );
+ while ( ( fasta_get_entry( fp, &entry ) != FALSE ) ) {
+ printf( "seq_name: %s seq_len: %zu\n", entry->seq_name, entry->seq_len );
+ }
- printf( "SEQ_NAME: ->%s<-\n", seq_name );
+ seq_destroy( entry );
- assert( strcmp( seq_name, "test0" ) == 0 );
+ close_stream( fp );
fprintf( stderr, "OK\n" );
}
-//void test_fasta_put_entry()
-//{
-// fprintf( stderr, " Testing fasta_put_entry ... " );
-//
-// seq_entry *entry = NULL;
-// char *seq_name = "test";
-// char *seq = "ATCG";
-// size_t seq_len = strlen( seq );
-//
-// entry = mem_get( sizeof( seq_entry ) );
-//
-// entry->seq_name = seq_name;
-// entry->seq = seq;
-// entry->seq_len = seq_len;
-//
-// fasta_put_entry( entry );
-//
-// fprintf( stderr, "OK\n" );
-//}
+void test_fasta_put_entry()
+{
+ fprintf( stderr, " Testing fasta_put_entry ... " );
+
+ FILE *fp = NULL;
+ seq_entry *entry = NULL;
+ size_t max_seq_name = MAX_SEQ_NAME;
+ size_t max_seq = MAX_SEQ;
+
+ fp = read_open( TEST_FILE1 );
+
+ seq_new( &entry, max_seq_name, max_seq );
+
+ while ( ( fasta_get_entry( fp, &entry ) != FALSE ) ) {
+ fasta_put_entry( entry );
+ }
+
+ seq_destroy( entry );
+
+ close_stream( fp );
+
+ fprintf( stderr, "OK\n" );
+}