]> git.donarmstrong.com Git - fastq-tools.git/blobdiff - src/fastq-match.c
Much simpler faster code for parsing fastq files.
[fastq-tools.git] / src / fastq-match.c
index ccfca73860501d6b1cf32915ff25632ca7230a46..1e4d9c705344af121b645869edaba736b78736a8 100644 (file)
@@ -1,21 +1,23 @@
 
-/* Smith-Waterman alignments against sequences within a fastq file.
+/*
+ * This file is part of fastq-tools.
+ *
+ * Copyright (c) 2011 by Daniel C. Jones <dcjones@cs.washington.edu>
+ *
+ * fastq-match :
+ * Smith-Waterman alignments against sequences within a fastq file.
  *
- * Daniel Jones <dcjones@cs.washington.edu>
  */
 
+
+#include "common.h"
+#include "parse.h"
+#include "sw.h"
 #include <stdlib.h>
+#include <string.h>
 #include <zlib.h>
 #include <getopt.h>
 
-#include "swsse2/blosum62.h"
-#include "swsse2/swsse2.h"
-#include "swsse2/matrix.h"
-#include "swsse2/swstriped.h"
-#include "kseq.h"
-KSEQ_INIT(gzFile, gzread)
-
-
 #if defined(MSDOS) || defined(OS2) || defined(WIN32) || defined(__CYGWIN__)
 #  include <fcntl.h>
 #  include <io.h>
@@ -25,71 +27,40 @@ KSEQ_INIT(gzFile, gzread)
 #endif
 
 
-static int help_flag;
+static const char* prog_name = "fastq-match";
 
 
 void print_help()
 {
-    fprintf( stderr
+    fprintf(stdout
 "fastq-match [OPTION]... QUERY [FILE]...\n"
 "Perform Smith-Waterman local alignment of a query sequence\n"
 "against each sequence in a fastq file.\n\n"
 "Options:\n"
 "  -h, --help              print this message\n"
+"  -V, --version           output version information and exit\n"
     );
 }
 
 
-void convert_sequence(unsigned char* s, int n)
-{
-    int i;
-    for (i = 0; i < n; i++) {
-        s[i] = (char)AMINO_ACID_VALUE[(int)s[i]];
-    }
-}
-
-
-void fastq_match(gzFile fin, FILE* fout,
-                 SwStripedData* sw_data,
-                 unsigned char* query, int n,
-                 SEARCH_OPTIONS* options)
+void fastq_match(FILE* fin, FILE* fout, sw_t* sw)
 {
     int score;
 
-    int gap_init  = -(options->gapInit + options->gapExt);
-    int gap_ext   = -options->gapExt;
-    int threshold = options->threshold;
-
-    kseq_t* seq;
-    seq = kseq_init(fin);
+    fastq_t* fqf = fastq_create(fin);
+    seq_t* seq = seq_create();
 
-    while (kseq_read(seq) >= 0) {
+    while (fastq_read(fqf, seq)) {
         fprintf(fout, "%s\t", seq->seq.s);
 
-        convert_sequence((unsigned char*)seq->seq.s, seq->seq.l);
-
-        score = swStripedByte(query, n,
-                              (unsigned char*)seq->seq.s, seq->seq.l,
-                              gap_init, gap_ext,
-                              sw_data->pvbQueryProf,
-                              sw_data->pvH1,
-                              sw_data->pvH2,
-                              sw_data->pvE,
-                              sw_data->bias);
-        if (score >= 255) {
-            score = swStripedWord(query, n,
-                                  (unsigned char*)seq->seq.s, seq->seq.l,
-                                  gap_init, gap_ext,
-                                  sw_data->pvbQueryProf,
-                                  sw_data->pvH1,
-                                  sw_data->pvH2,
-                                  sw_data->pvE);
-        }
+        fastq_sw_conv_seq((unsigned char*)seq->seq.s, seq->seq.n);
+        score = fastq_sw(sw, (unsigned char*)seq->seq.s, seq->seq.n);
 
         fprintf(fout, "%d\n", score);
     }
 
-    kseq_destroy(seq);
+    seq_free(seq);
+    fastq_free(fqf);
 }
 
 
@@ -101,33 +72,26 @@ int main(int argc, char* argv[])
 
     unsigned char* query;
     int query_len;
-    SwStripedData* sw_data;
-    signed char* mat = blosum62;
-    SEARCH_OPTIONS options;
 
-    options.gapInit   = -10;
-    options.gapExt    = -2;
-    options.threshold = -1;
+    sw_t* sw;
 
     FILE*  fin;
-    gzFile gzfin;
-
-    help_flag = 0;
 
     int opt;
     int opt_idx;
 
     static struct option long_options[] =
         { 
-          {"help", no_argument, &help_flag, 1},
+          {"help",       no_argument,       NULL, 'h'},
+          {"version",    no_argument,       NULL, 'V'},
           {0, 0, 0, 0}
         };
 
 
     while (1) {
-        opt = getopt_long(argc, argv, "h", long_options, &opt_idx);
+        opt = getopt_long(argc, argv, "hV", long_options, &opt_idx);
 
-        if( opt == -1 ) break;
+        if (opt == -1) break;
 
         switch (opt) {
             case 0:
@@ -137,8 +101,12 @@ int main(int argc, char* argv[])
                 break;
 
             case 'h':
-                help_flag = 1;
-                break;
+                print_help();
+                return 0;
+
+            case 'V':
+                print_version(stdout, prog_name);
+                return 0;
 
             case '?':
                 return 1;
@@ -148,10 +116,6 @@ int main(int argc, char* argv[])
         }
     }
 
-    if (help_flag) {
-        print_help();
-        return 0;
-    }
 
     if (optind >= argc) {
         fprintf(stderr, "A query sequence must be specified.\n");
@@ -160,20 +124,12 @@ int main(int argc, char* argv[])
 
     query = (unsigned char*)argv[optind++];
     query_len = strlen((char*)query);
-    convert_sequence(query, query_len);
+    fastq_sw_conv_seq(query, query_len);
 
-    sw_data = swStripedInit(query, query_len, mat);
+    sw = fastq_alloc_sw(query, query_len);
 
     if (optind >= argc || (argc - optind == 1 && strcmp(argv[optind],"-") == 0)) {
-        gzfin = gzdopen( fileno(stdin), "rb" );
-        if (gzfin == NULL) {
-            fprintf(stderr, "Malformed file 'stdin'.\n");
-            return 1;
-        }
-
-        fastq_match(gzfin, stdout, sw_data, query, query_len, &options);
-
-        gzclose(gzfin);
+        fastq_match(stdin, stdout, sw);
     }
     else {
         for (; optind < argc; optind++) {
@@ -183,19 +139,11 @@ int main(int argc, char* argv[])
                 continue;
             }
 
-            gzfin = gzdopen(fileno(fin), "rb");
-            if (gzfin == NULL) {
-                fprintf(stderr, "Malformed file '%s'.\n", argv[optind]);
-                continue;
-            }
-
-            fastq_match(gzfin, stdout, sw_data, query, query_len, &options);
-
-            gzclose(gzfin);
+            fastq_match(fin, stdout, sw);
         }
     }
 
-    swStripedComplete(sw_data);
+    fastq_free_sw(sw);
 
     return 0;
 }