From 14187b3723d681ef33f72e107b85ebdba7cab3a0 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Thu, 14 Aug 2008 08:35:21 +0000 Subject: [PATCH] updated read_solexa git-svn-id: http://biopieces.googlecode.com/svn/trunk@208 74ccb610-7750-0410-82ae-013aeee3265d --- code_c/Maasha/src/Makefile | 3 +- code_c/Maasha/src/inc/common.h | 2 +- code_c/Maasha/src/inc/fasta.h | 8 +- code_c/Maasha/src/lib/fasta.c | 74 +++++++++--------- code_c/Maasha/src/test/Makefile | 12 ++- code_c/Maasha/src/test/test_fasta.c | 46 +++++++++++ code_c/Maasha/src/test/test_filesys.c | 1 - code_c/Maasha/src/test_all.pl | 3 +- code_c/Maasha/src/test_fasta.c | 45 ----------- code_perl/Maasha/Biopieces.pm | 26 +++---- code_perl/Maasha/Solexa.pm | 106 ++++++++++++++++++++++++++ 11 files changed, 217 insertions(+), 109 deletions(-) create mode 100644 code_c/Maasha/src/test/test_fasta.c delete mode 100644 code_c/Maasha/src/test_fasta.c create mode 100644 code_perl/Maasha/Solexa.pm diff --git a/code_c/Maasha/src/Makefile b/code_c/Maasha/src/Makefile index 81fbdf3..25191a5 100644 --- a/code_c/Maasha/src/Makefile +++ b/code_c/Maasha/src/Makefile @@ -26,4 +26,5 @@ repeat-O-matic: repeat-O-matic.c clean: cd $(LIB_DIR) && ${MAKE} clean cd $(TEST_DIR) && ${MAKE} clean - rm -f fasta_count repeat-O-matic + rm fasta_count + rm repeat-O-matic diff --git a/code_c/Maasha/src/inc/common.h b/code_c/Maasha/src/inc/common.h index 0401dd1..362fd6b 100644 --- a/code_c/Maasha/src/inc/common.h +++ b/code_c/Maasha/src/inc/common.h @@ -10,7 +10,7 @@ #include /* Define a shorthand for unsigned int */ -#define uint unsigned int +//typedef uint unsigned int /* Define a boolean type */ #define bool char diff --git a/code_c/Maasha/src/inc/fasta.h b/code_c/Maasha/src/inc/fasta.h index 6151876..671c449 100644 --- a/code_c/Maasha/src/inc/fasta.h +++ b/code_c/Maasha/src/inc/fasta.h @@ -8,14 +8,16 @@ struct seq_entry size_t seq_len; }; +typedef struct seq_entry seq_entry; + /* Count all entries in a FASTA file given a file pointer. */ -uint fasta_count( FILE *fp ); +size_t fasta_count( FILE *fp ); /* Get next sequence entry from a FASTA file given a file pointer. */ -bool fasta_get_entry( FILE *fp, struct seq_entry *entry ); +bool fasta_get_entry( FILE *fp, seq_entry **entry ); /* Output a sequence entry in FASTA format. */ -void fasta_put_entry( struct seq_entry *entry ); +void fasta_put_entry( seq_entry *entry ); /* Get all sequence entries from a FASTA file in a list. */ void fasta_get_entries( FILE *fp, struct list **entries ); diff --git a/code_c/Maasha/src/lib/fasta.c b/code_c/Maasha/src/lib/fasta.c index b4e929f..87458a0 100644 --- a/code_c/Maasha/src/lib/fasta.c +++ b/code_c/Maasha/src/lib/fasta.c @@ -4,14 +4,14 @@ #include "list.h" -uint fasta_count( FILE *fp ) +size_t fasta_count( FILE *fp ) { /* Martin A. Hansen, May 2008 */ /* Counts all entries in a FASTA file given a file pointer. */ - char buffer[ FASTA_BUFFER ]; - uint count; + char buffer[ FASTA_BUFFER ]; + size_t count; count = 0; @@ -26,23 +26,21 @@ uint fasta_count( FILE *fp ) } -bool fasta_get_entry( FILE *fp, struct seq_entry *entry ) +bool fasta_get_entry( FILE *fp, seq_entry **entry ) { /* Martin A. Hansen, May 2008 */ /* Get next sequence entry from a FASTA file given a file pointer. */ - int i; + size_t i; size_t j; size_t offset; size_t seq_len; char buffer[ FASTA_BUFFER ]; - int buffer_len; + size_t buffer_len; char *seq_name = NULL; char *seq = NULL; - entry = mem_get( sizeof( entry ) ); - offset = ftell( fp ); /* ---- Skip ahead until header line and include header ---- */ @@ -55,7 +53,7 @@ bool fasta_get_entry( FILE *fp, struct seq_entry *entry ) if ( ( buffer[ 0 ] == '>' ) ) { - seq_name = mem_get_zero( buffer_len - 1 ); + seq_name = mem_get( buffer_len - 1 ); for ( i = 1; i < buffer_len - 1; i++ ) { seq_name[ i - 1 ] = buffer[ i ]; @@ -90,12 +88,14 @@ bool fasta_get_entry( FILE *fp, struct seq_entry *entry ) /* ---- Allocate memory for sequence ---- */ - seq = mem_get_zero( seq_len + 1 ); + seq = mem_get( seq_len + 1 ); /* ---- Rewind file pointer and read sequence ---- */ - if ( fseek( fp, offset, SEEK_SET ) < 0 ) { - die( "fseek SEEK_SET failed." ); + if ( fseek( fp, offset, SEEK_SET ) != 0 ) + { + fprintf( stderr, "ERROR: fseek SEEK_SET failed: %s\n", strerror( errno ) ); + abort(); } j = 0; @@ -112,9 +112,9 @@ bool fasta_get_entry( FILE *fp, struct seq_entry *entry ) { seq[ j + 1 ] = '\0'; - entry->seq_name = seq_name; - entry->seq = seq; - entry->seq_len = seq_len; + ( *entry )->seq_name = seq_name; + ( *entry )->seq = seq; + ( *entry )->seq_len = seq_len; return TRUE; } @@ -137,28 +137,28 @@ void fasta_put_entry( struct seq_entry *entry ) } -void fasta_get_entries( FILE *fp, struct list **entries ) -{ - /* Martin A. Hansen, May 2008 */ - - /* Given a file pointer to a FASTA file retreives all */ - /* sequence entries and insert those in a list. */ - - struct seq_entry *entry; - - while ( 1 ) - { - entry = mem_get( sizeof( entry ) ); - - if ( ! fasta_get_entry( fp, entry ) ) { - break; - } - - list_add( entries, entry ); - } - - list_reverse( entries ); -} +//void fasta_get_entries( FILE *fp, struct list **entries ) +//{ +// /* Martin A. Hansen, May 2008 */ +// +// /* Given a file pointer to a FASTA file retreives all */ +// /* sequence entries and insert those in a list. */ +// +// struct seq_entry *entry; +// +// while ( 1 ) +// { +// entry = mem_get( sizeof( entry ) ); +// +// if ( ! fasta_get_entry( fp, entry ) ) { +// break; +// } +// +// list_add( entries, entry ); +// } +// +// list_reverse( entries ); +//} void fasta_put_entries( struct list *entries ) diff --git a/code_c/Maasha/src/test/Makefile b/code_c/Maasha/src/test/Makefile index d781cb4..a74cfc3 100644 --- a/code_c/Maasha/src/test/Makefile +++ b/code_c/Maasha/src/test/Makefile @@ -9,7 +9,10 @@ LIB = -lm $(LIB_DIR)*.o all: test -test: test_filesys test_mem test_strings +test: test_fasta test_filesys test_mem test_strings + +test_fasta: test_fasta.c $(LIB_DIR)fasta.c + $(CC) $(Cflags) $(INC) $(LIB) test_fasta.c -o test_fasta test_filesys: test_filesys.c $(LIB_DIR)filesys.c $(CC) $(Cflags) $(INC) $(LIB) test_filesys.c -o test_filesys @@ -21,6 +24,7 @@ test_strings: test_strings.c $(LIB_DIR)strings.c $(CC) $(Cflags) $(INC) $(LIB) test_strings.c -o test_strings clean: - rm -f test_filesys - rm -f test_mem - rm -f test_strings + rm test_fasta + rm test_filesys + rm test_mem + rm test_strings diff --git a/code_c/Maasha/src/test/test_fasta.c b/code_c/Maasha/src/test/test_fasta.c new file mode 100644 index 0000000..33a73d5 --- /dev/null +++ b/code_c/Maasha/src/test/test_fasta.c @@ -0,0 +1,46 @@ +#include "common.h" +#include "filesys.h" +#include "mem.h" +#include "fasta.h" + +#define TEST_FILE "test/test_files/test.fna" + +static void test_fasta_get_entry(); + +int main() +{ + fprintf( stderr, "Running all tests for fasta.c\n" ); + + test_fasta_get_entry(); + + fprintf( stderr, "Done\n\n" ); + + return EXIT_SUCCESS; +} + + +void test_fasta_get_entry() +{ + fprintf( stderr, " Testing fasta_get_entry ... " ); + + FILE *fp; + seq_entry *entry; + + fp = read_open( TEST_FILE ); + + entry = mem_get( sizeof( entry ) ); + + if ( fasta_get_entry( fp, &entry ) != FALSE ) + { + assert( strlen( entry->seq_name ) == 5 ); + assert( strlen( entry->seq ) == 60 ); + assert( entry->seq_len == 60 ); + assert( strlen( entry->seq ) == entry->seq_len ); + } + + close_stream( fp ); + + fprintf( stderr, "OK\n" ); +} + + diff --git a/code_c/Maasha/src/test/test_filesys.c b/code_c/Maasha/src/test/test_filesys.c index 9624668..2dbc11a 100644 --- a/code_c/Maasha/src/test/test_filesys.c +++ b/code_c/Maasha/src/test/test_filesys.c @@ -1,6 +1,5 @@ #include "common.h" #include "filesys.h" -#include static void test_read_open(); static void test_write_open(); diff --git a/code_c/Maasha/src/test_all.pl b/code_c/Maasha/src/test_all.pl index 0868fa4..ba9d92a 100755 --- a/code_c/Maasha/src/test_all.pl +++ b/code_c/Maasha/src/test_all.pl @@ -8,8 +8,9 @@ my ( $test_dir, @tests, $test ); $test_dir = "test"; @tests = qw( - test_mem + test_fasta test_filesys + test_mem test_strings ); diff --git a/code_c/Maasha/src/test_fasta.c b/code_c/Maasha/src/test_fasta.c deleted file mode 100644 index f966764..0000000 --- a/code_c/Maasha/src/test_fasta.c +++ /dev/null @@ -1,45 +0,0 @@ -#include "common.h" -#include "filesys.h" -#include "list.h" -#include "fasta.h" - -int main() -{ - char *file = NULL; - FILE *fp = NULL; - struct seq_entry *entry = NULL; - - file = "/Users/m.hansen/test.fna"; - - fp = read_open( file ); - - while ( ( fasta_get_entry( fp, entry ) != NULL ) ) - { - printf( "seq_name: %s seq: %s seq_len: %i\n", entry->seq_name, entry->seq, entry->seq_len ); - } - - close_stream( fp ); - - return 0; -} - -/* - -int main() -{ - char *file = "/Users/m.hansen/test.fna"; - FILE *fp; - - struct list *entries = NULL; - - fp = read_open( file ); - - fasta_get_entries( fp, &entries ); - - fasta_put_entries( entries); - close_stream( fp ); - - return 0; -} - -*/ diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index a380d3b..b5c5325 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -51,6 +51,7 @@ use Maasha::NCBI; use Maasha::GFF; use Maasha::TwoBit; use Maasha::Solid; +use Maasha::Solexa; use Maasha::SQL; use Maasha::Gwiki; @@ -70,6 +71,7 @@ require Exporter; use constant { SEQ_NAME => 0, SEQ => 1, + SCORE => 2, }; @@ -1886,7 +1888,7 @@ sub script_read_solexa # Returns nothing. - my ( $record, $file, $base_name, $data_in, $line, $num, @fields, @seqs, @scores, $i, $seq, $seq_count ); + my ( $record, $file, $data_in, $entry, $num, @seqs, @scores, $i ); $options->{ "quality" } ||= 20; @@ -1898,34 +1900,26 @@ sub script_read_solexa foreach $file ( @{ $options->{ "files" } } ) { - $data_in = Maasha::Common::read_open( $file ); - $base_name = Maasha::Common::get_basename( $file ); - $base_name =~ s/\..*//; - - $seq_count = 0; + $data_in = Maasha::Common::read_open( $file ); - while ( $line = <$data_in> ) + while ( $entry = Maasha::Solexa::solexa_get_entry( $data_in ) ) { - @fields = split /:/, $line; - @seqs = split //, $fields[ 5 ]; - @scores = split / /, $fields[ -1 ]; + @seqs = split //, $entry->[ SEQ ]; + @scores = split /:/, $entry->[ SCORE ]; for ( $i = 0; $i < @scores; $i++ ) { $seqs[ $i ] = lc $seqs[ $i ] if $scores[ $i ] < $options->{ "quality" }; } - $seq = join "", @seqs; - - $record->{ "SEQ_NAME" } = sprintf( "%s_ID%08d", $base_name, $seq_count ); - $record->{ "SEQ" } = $seq; - $record->{ "SEQ_LEN" } = length $seq; + $record->{ "SEQ_NAME" } = $entry->[ SEQ_NAME ]; + $record->{ "SEQ" } = join "", @seqs; + $record->{ "SEQ_LEN" } = scalar @seqs; $record->{ "SCORE_MEAN" } = sprintf ( "%.2f", Maasha::Calc::mean( \@scores ) ); put_record( $record, $out ); goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - $seq_count++; $num++; } diff --git a/code_perl/Maasha/Solexa.pm b/code_perl/Maasha/Solexa.pm new file mode 100644 index 0000000..930fc88 --- /dev/null +++ b/code_perl/Maasha/Solexa.pm @@ -0,0 +1,106 @@ +package Maasha::Solexa; + + +# Copyright (C) 2007-2008 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# Routines for manipulation Solid sequence files with di-base encoding. + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use vars qw( @ISA @EXPORT_OK ); + +require Exporter; + +@ISA = qw( Exporter ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub solexa_get_entry +{ + # Martin A. Hansen, August 2008 + + # Get the next Solexa entry form a file and returns + # a triple of [ seq_name, seq, score ]. + # We asume a Solexa entry consists of four lines: + + # @HWI-EAS157_20FFGAAXX:2:1:888:434 + # TTGGTCGCTCGCTCCGCGACCTCAGATCAGACGTGG + # +HWI-EAS157_20FFGAAXX:2:1:888:434 + # hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhKhe + + my ( $fh, # filehandle to Solexa file + ) = @_; + + # Returns a list. + + my ( $seq_header, $seq, $score_head, $score, @scores ); + + $seq_header = <$fh>; + $seq = <$fh>; + $score_header = <$fh>; + $score = <$fh>; + + return if not defined $score; + + chomp $seq_header; + chomp $seq; + chomp $score_header; + chomp $score; + + @scores = split //, $score; + + map { $_ = score_oct2dec( $_ ) } @scores; + + $seq_header =~ s/^@//; + + $score = join( ":", @scores ); + + return wantarray ? ( $seq_header, $seq, $score ) : [ $seq_header, $seq, $score ]; +} + + +sub score_oct2dec +{ + # Martin A. Hansen, August 2008 + + # Converts a Solexa score in octal format to decimal format. + + # http://maq.sourceforge.net/fastq.shtml + + my ( $char, # octal score + ) = @_; + + # Returns a float + + return 10 * log(1 + 10 ** ((ord( $char ) - 64) / 10.0)) / log(10); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +1; -- 2.39.2