clean:
cd $(LIB_DIR) && ${MAKE} clean
cd $(TEST_DIR) && ${MAKE} clean
- rm -f fasta_count repeat-O-matic
+ rm fasta_count
+ rm repeat-O-matic
#include <errno.h>
/* Define a shorthand for unsigned int */
-#define uint unsigned int
+//typedef uint unsigned int
/* Define a boolean type */
#define bool char
size_t seq_len;
};
+typedef struct seq_entry seq_entry;
+
/* Count all entries in a FASTA file given a file pointer. */
-uint fasta_count( FILE *fp );
+size_t fasta_count( FILE *fp );
/* Get next sequence entry from a FASTA file given a file pointer. */
-bool fasta_get_entry( FILE *fp, struct seq_entry *entry );
+bool fasta_get_entry( FILE *fp, seq_entry **entry );
/* Output a sequence entry in FASTA format. */
-void fasta_put_entry( struct seq_entry *entry );
+void fasta_put_entry( seq_entry *entry );
/* Get all sequence entries from a FASTA file in a list. */
void fasta_get_entries( FILE *fp, struct list **entries );
#include "list.h"
-uint fasta_count( FILE *fp )
+size_t fasta_count( FILE *fp )
{
/* Martin A. Hansen, May 2008 */
/* Counts all entries in a FASTA file given a file pointer. */
- char buffer[ FASTA_BUFFER ];
- uint count;
+ char buffer[ FASTA_BUFFER ];
+ size_t count;
count = 0;
}
-bool fasta_get_entry( FILE *fp, struct seq_entry *entry )
+bool fasta_get_entry( FILE *fp, seq_entry **entry )
{
/* Martin A. Hansen, May 2008 */
/* Get next sequence entry from a FASTA file given a file pointer. */
- int i;
+ size_t i;
size_t j;
size_t offset;
size_t seq_len;
char buffer[ FASTA_BUFFER ];
- int buffer_len;
+ size_t buffer_len;
char *seq_name = NULL;
char *seq = NULL;
- entry = mem_get( sizeof( entry ) );
-
offset = ftell( fp );
/* ---- Skip ahead until header line and include header ---- */
if ( ( buffer[ 0 ] == '>' ) )
{
- seq_name = mem_get_zero( buffer_len - 1 );
+ seq_name = mem_get( buffer_len - 1 );
for ( i = 1; i < buffer_len - 1; i++ ) {
seq_name[ i - 1 ] = buffer[ i ];
/* ---- Allocate memory for sequence ---- */
- seq = mem_get_zero( seq_len + 1 );
+ seq = mem_get( seq_len + 1 );
/* ---- Rewind file pointer and read sequence ---- */
- if ( fseek( fp, offset, SEEK_SET ) < 0 ) {
- die( "fseek SEEK_SET failed." );
+ if ( fseek( fp, offset, SEEK_SET ) != 0 )
+ {
+ fprintf( stderr, "ERROR: fseek SEEK_SET failed: %s\n", strerror( errno ) );
+ abort();
}
j = 0;
{
seq[ j + 1 ] = '\0';
- entry->seq_name = seq_name;
- entry->seq = seq;
- entry->seq_len = seq_len;
+ ( *entry )->seq_name = seq_name;
+ ( *entry )->seq = seq;
+ ( *entry )->seq_len = seq_len;
return TRUE;
}
}
-void fasta_get_entries( FILE *fp, struct list **entries )
-{
- /* Martin A. Hansen, May 2008 */
-
- /* Given a file pointer to a FASTA file retreives all */
- /* sequence entries and insert those in a list. */
-
- struct seq_entry *entry;
-
- while ( 1 )
- {
- entry = mem_get( sizeof( entry ) );
-
- if ( ! fasta_get_entry( fp, entry ) ) {
- break;
- }
-
- list_add( entries, entry );
- }
-
- list_reverse( entries );
-}
+//void fasta_get_entries( FILE *fp, struct list **entries )
+//{
+// /* Martin A. Hansen, May 2008 */
+//
+// /* Given a file pointer to a FASTA file retreives all */
+// /* sequence entries and insert those in a list. */
+//
+// struct seq_entry *entry;
+//
+// while ( 1 )
+// {
+// entry = mem_get( sizeof( entry ) );
+//
+// if ( ! fasta_get_entry( fp, entry ) ) {
+// break;
+// }
+//
+// list_add( entries, entry );
+// }
+//
+// list_reverse( entries );
+//}
void fasta_put_entries( struct list *entries )
all: test
-test: test_filesys test_mem test_strings
+test: test_fasta test_filesys test_mem test_strings
+
+test_fasta: test_fasta.c $(LIB_DIR)fasta.c
+ $(CC) $(Cflags) $(INC) $(LIB) test_fasta.c -o test_fasta
test_filesys: test_filesys.c $(LIB_DIR)filesys.c
$(CC) $(Cflags) $(INC) $(LIB) test_filesys.c -o test_filesys
$(CC) $(Cflags) $(INC) $(LIB) test_strings.c -o test_strings
clean:
- rm -f test_filesys
- rm -f test_mem
- rm -f test_strings
+ rm test_fasta
+ rm test_filesys
+ rm test_mem
+ rm test_strings
--- /dev/null
+#include "common.h"
+#include "filesys.h"
+#include "mem.h"
+#include "fasta.h"
+
+#define TEST_FILE "test/test_files/test.fna"
+
+static void test_fasta_get_entry();
+
+int main()
+{
+ fprintf( stderr, "Running all tests for fasta.c\n" );
+
+ test_fasta_get_entry();
+
+ fprintf( stderr, "Done\n\n" );
+
+ return EXIT_SUCCESS;
+}
+
+
+void test_fasta_get_entry()
+{
+ fprintf( stderr, " Testing fasta_get_entry ... " );
+
+ FILE *fp;
+ seq_entry *entry;
+
+ fp = read_open( TEST_FILE );
+
+ entry = mem_get( sizeof( entry ) );
+
+ if ( fasta_get_entry( fp, &entry ) != FALSE )
+ {
+ assert( strlen( entry->seq_name ) == 5 );
+ assert( strlen( entry->seq ) == 60 );
+ assert( entry->seq_len == 60 );
+ assert( strlen( entry->seq ) == entry->seq_len );
+ }
+
+ close_stream( fp );
+
+ fprintf( stderr, "OK\n" );
+}
+
+
#include "common.h"
#include "filesys.h"
-#include <errno.h>
static void test_read_open();
static void test_write_open();
$test_dir = "test";
@tests = qw(
- test_mem
+ test_fasta
test_filesys
+ test_mem
test_strings
);
+++ /dev/null
-#include "common.h"
-#include "filesys.h"
-#include "list.h"
-#include "fasta.h"
-
-int main()
-{
- char *file = NULL;
- FILE *fp = NULL;
- struct seq_entry *entry = NULL;
-
- file = "/Users/m.hansen/test.fna";
-
- fp = read_open( file );
-
- while ( ( fasta_get_entry( fp, entry ) != NULL ) )
- {
- printf( "seq_name: %s seq: %s seq_len: %i\n", entry->seq_name, entry->seq, entry->seq_len );
- }
-
- close_stream( fp );
-
- return 0;
-}
-
-/*
-
-int main()
-{
- char *file = "/Users/m.hansen/test.fna";
- FILE *fp;
-
- struct list *entries = NULL;
-
- fp = read_open( file );
-
- fasta_get_entries( fp, &entries );
-
- fasta_put_entries( entries);
- close_stream( fp );
-
- return 0;
-}
-
-*/
use Maasha::GFF;
use Maasha::TwoBit;
use Maasha::Solid;
+use Maasha::Solexa;
use Maasha::SQL;
use Maasha::Gwiki;
use constant {
SEQ_NAME => 0,
SEQ => 1,
+ SCORE => 2,
};
# Returns nothing.
- my ( $record, $file, $base_name, $data_in, $line, $num, @fields, @seqs, @scores, $i, $seq, $seq_count );
+ my ( $record, $file, $data_in, $entry, $num, @seqs, @scores, $i );
$options->{ "quality" } ||= 20;
foreach $file ( @{ $options->{ "files" } } )
{
- $data_in = Maasha::Common::read_open( $file );
- $base_name = Maasha::Common::get_basename( $file );
- $base_name =~ s/\..*//;
-
- $seq_count = 0;
+ $data_in = Maasha::Common::read_open( $file );
- while ( $line = <$data_in> )
+ while ( $entry = Maasha::Solexa::solexa_get_entry( $data_in ) )
{
- @fields = split /:/, $line;
- @seqs = split //, $fields[ 5 ];
- @scores = split / /, $fields[ -1 ];
+ @seqs = split //, $entry->[ SEQ ];
+ @scores = split /:/, $entry->[ SCORE ];
for ( $i = 0; $i < @scores; $i++ ) {
$seqs[ $i ] = lc $seqs[ $i ] if $scores[ $i ] < $options->{ "quality" };
}
- $seq = join "", @seqs;
-
- $record->{ "SEQ_NAME" } = sprintf( "%s_ID%08d", $base_name, $seq_count );
- $record->{ "SEQ" } = $seq;
- $record->{ "SEQ_LEN" } = length $seq;
+ $record->{ "SEQ_NAME" } = $entry->[ SEQ_NAME ];
+ $record->{ "SEQ" } = join "", @seqs;
+ $record->{ "SEQ_LEN" } = scalar @seqs;
$record->{ "SCORE_MEAN" } = sprintf ( "%.2f", Maasha::Calc::mean( \@scores ) );
put_record( $record, $out );
goto NUM if $options->{ "num" } and $num == $options->{ "num" };
- $seq_count++;
$num++;
}
--- /dev/null
+package Maasha::Solexa;
+
+
+# Copyright (C) 2007-2008 Martin A. Hansen.
+
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+
+# http://www.gnu.org/copyleft/gpl.html
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+# Routines for manipulation Solid sequence files with di-base encoding.
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+use vars qw( @ISA @EXPORT_OK );
+
+require Exporter;
+
+@ISA = qw( Exporter );
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub solexa_get_entry
+{
+ # Martin A. Hansen, August 2008
+
+ # Get the next Solexa entry form a file and returns
+ # a triple of [ seq_name, seq, score ].
+ # We asume a Solexa entry consists of four lines:
+
+ # @HWI-EAS157_20FFGAAXX:2:1:888:434
+ # TTGGTCGCTCGCTCCGCGACCTCAGATCAGACGTGG
+ # +HWI-EAS157_20FFGAAXX:2:1:888:434
+ # hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhKhe
+
+ my ( $fh, # filehandle to Solexa file
+ ) = @_;
+
+ # Returns a list.
+
+ my ( $seq_header, $seq, $score_head, $score, @scores );
+
+ $seq_header = <$fh>;
+ $seq = <$fh>;
+ $score_header = <$fh>;
+ $score = <$fh>;
+
+ return if not defined $score;
+
+ chomp $seq_header;
+ chomp $seq;
+ chomp $score_header;
+ chomp $score;
+
+ @scores = split //, $score;
+
+ map { $_ = score_oct2dec( $_ ) } @scores;
+
+ $seq_header =~ s/^@//;
+
+ $score = join( ":", @scores );
+
+ return wantarray ? ( $seq_header, $seq, $score ) : [ $seq_header, $seq, $score ];
+}
+
+
+sub score_oct2dec
+{
+ # Martin A. Hansen, August 2008
+
+ # Converts a Solexa score in octal format to decimal format.
+
+ # http://maq.sourceforge.net/fastq.shtml
+
+ my ( $char, # octal score
+ ) = @_;
+
+ # Returns a float
+
+ return 10 * log(1 + 10 ** ((ord( $char ) - 64) / 10.0)) / log(10);
+}
+
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+1;