rsem-v1.1.14b3

author Bo Li <bli@cs.wisc.edu>

Sun, 4 Dec 2011 05:55:03 +0000 (23:55 -0600)

committer Bo Li <bli@cs.wisc.edu>

Sun, 4 Dec 2011 05:55:03 +0000 (23:55 -0600)
author Bo Li <bli@cs.wisc.edu>
Sun, 4 Dec 2011 05:55:03 +0000 (23:55 -0600)
committer Bo Li <bli@cs.wisc.edu>
Sun, 4 Dec 2011 05:55:03 +0000 (23:55 -0600)
diff --git a/EM.cpp b/EM.cpp

index 8362f7d47dea60b762801d5877352d196bcf35f7..f9449a85cd3226a3f29356f10fa018e4ca908cba 100644 (file)
--- a/EM.cpp
+++ b/EM.cpp
@@ -102,7 +102,7 @@ void init(ReadReader<ReadType> **&readers, HitContainer<HitType> **&hitvs, doubl
                 indices[i] = new ReadIndex(readFs[i]);
         }
         for (int i = 0; i < nThreads; i++) {
-               readers[i] = new ReadReader<ReadType>(s, readFs);
+               readers[i] = new ReadReader<ReadType>(s, readFs, refs.hasPolyA(), mparams.seedLen); // allow calculation of calc_lq() function
                 readers[i]->setIndices(indices);
         }
  
diff --git a/PairedEndModel.h b/PairedEndModel.h

index c011ceea16d53757384e320ae7196251eea7b956..70d14c9de9a97680eb9a683715f887a1d2a1e16f 100644 (file)
--- a/PairedEndModel.h
+++ b/PairedEndModel.h
@@ -230,25 +230,25 @@ void PairedEndModel::estimateFromReads(const char* readFN) {
      for (int i = 0; i < 3; i++)
         if (N[i] > 0) {
                 genReadFileNames(readFN, i, read_type, s, readFs);
-               ReadReader<PairedEndRead> reader(s, readFs);
+               ReadReader<PairedEndRead> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
  
                 int cnt = 0;
                 while (reader.next(read)) {
                         SingleRead mate1 = read.getMate1();
                         SingleRead mate2 = read.getMate2();
                         
-                       if (!read.isLowQuality()) {
-                         mld->update(mate1.getReadLength(), 1.0);
-                         mld->update(mate2.getReadLength(), 1.0);
+                       if (!read.isLowQuality()) {
+                               mld->update(mate1.getReadLength(), 1.0);
+                               mld->update(mate2.getReadLength(), 1.0);
                           
-                         if (i == 0) {
-                           npro->updateC(mate1.getReadSeq());
-                           npro->updateC(mate2.getReadSeq());
-                         }
-                       }
-                       else if (verbose && (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN)) { 
-                         printf("Warning: Read %s is ignored due to at least one of the mates' length < %d!\n", read.getName().c_str(), OLEN);
-                       }
+                               if (i == 0) {
+                                       npro->updateC(mate1.getReadSeq());
+                                       npro->updateC(mate2.getReadSeq());
+                               }
+                       }
+                       else if (verbose && (mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen)) {
+                               printf("Warning: Read %s is ignored due to at least one of the mates' length < seed length %d!\n", read.getName().c_str(), seedLen);
+                       }
  
                         ++cnt;
                         if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
@@ -407,39 +407,39 @@ void PairedEndModel::finishSimulation() {
  }
  
  void PairedEndModel::calcMW() {
-  assert(seedLen >= OLEN && mld->getMinL() >= seedLen);
-
-  memset(mw, 0, sizeof(double) * (M + 1));
-  mw[0] = 1.0;
-
-  for (int i = 1; i <= M; i++) { 
-    RefSeq& ref = refs->getRef(i);
-    int totLen = ref.getTotLen();
-    int fullLen = ref.getFullLen();
-    int end = std::min(fullLen, totLen - gld->getMinL() + 1);
-    double value = 0.0;
-    int minL, maxL;
-    int effL, pfpos;
-
-    //seedPos is fpos here
-    for (int seedPos = 0; seedPos < end; seedPos++) 
-      if (ref.getMask(seedPos)) {
-       minL = gld->getMinL();
-       maxL = std::min(gld->getMaxL(), totLen - seedPos);
-       pfpos = seedPos;
-       for (int fragLen = minL; fragLen <= maxL; fragLen++) {
-         effL = std::min(fullLen, totLen - fragLen + 1); 
-         value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen); 
+       assert(mld->getMinL() >= seedLen);
+
+       memset(mw, 0, sizeof(double) * (M + 1));
+       mw[0] = 1.0;
+
+       for (int i = 1; i <= M; i++) {
+               RefSeq& ref = refs->getRef(i);
+               int totLen = ref.getTotLen();
+               int fullLen = ref.getFullLen();
+               int end = std::min(fullLen, totLen - gld->getMinL() + 1);
+               double value = 0.0;
+               int minL, maxL;
+               int effL, pfpos;
+
+               //seedPos is fpos here
+               for (int seedPos = 0; seedPos < end; seedPos++)
+                       if (ref.getMask(seedPos)) {
+                               minL = gld->getMinL();
+                               maxL = std::min(gld->getMaxL(), totLen - seedPos);
+                               pfpos = seedPos;
+                               for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+                                       effL = std::min(fullLen, totLen - fragLen + 1);
+                                       value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen);
+                               }
+                       }
+
+               mw[i] = 1.0 - value;
+
+               if (mw[i] < 1e-8) {
+                       //fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
+                       mw[i] = 0.0;
+               }
         }
-      }
-    
-    mw[i] = 1.0 - value;
-
-    if (mw[i] < 1e-8) { 
-      //fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
-      mw[i] = 0.0;
-    }
-  }
  }
  
  #endif /* PAIREDENDMODEL_H_ */
diff --git a/PairedEndQModel.h b/PairedEndQModel.h

index cbc5475588d19996154d892cc99af336b75ddbde..2c638abed103bcf4f7167b70b60221aa9eafed88 100644 (file)
--- a/PairedEndQModel.h
+++ b/PairedEndQModel.h
@@ -236,28 +236,28 @@ void PairedEndQModel::estimateFromReads(const char* readFN) {
      for (int i = 0; i < 3; i++)
         if (N[i] > 0) {
                 genReadFileNames(readFN, i, read_type, s, readFs);
-               ReadReader<PairedEndReadQ> reader(s, readFs);
+               ReadReader<PairedEndReadQ> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
  
                 int cnt = 0;
                 while (reader.next(read)) {
                         SingleReadQ mate1 = read.getMate1();
                         SingleReadQ mate2 = read.getMate2();
  
-                       if (!read.isLowQuality()) {
-                         mld->update(mate1.getReadLength(), 1.0);
-                         mld->update(mate2.getReadLength(), 1.0);
+                       if (!read.isLowQuality()) {
+                               mld->update(mate1.getReadLength(), 1.0);
+                               mld->update(mate2.getReadLength(), 1.0);
  
-                         qd->update(mate1.getQScore());
-                         qd->update(mate2.getQScore());
+                               qd->update(mate1.getQScore());
+                               qd->update(mate2.getQScore());
                           
-                         if (i == 0) {
-                           nqpro->updateC(mate1.getReadSeq(), mate1.getQScore());
-                           nqpro->updateC(mate2.getReadSeq(), mate2.getQScore());
-                         }
-                       }
-                       else if (verbose && (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN)) { 
-                         printf("Warning: Read %s is ignored due to at least one of the mates' length < %d!\n", read.getName().c_str(), OLEN);
-                       }
+                               if (i == 0) {
+                                       nqpro->updateC(mate1.getReadSeq(), mate1.getQScore());
+                                       nqpro->updateC(mate2.getReadSeq(), mate2.getQScore());
+                               }
+                       }
+                       else if (verbose && (mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen)) {
+                               printf("Warning: Read %s is ignored due to at least one of the mates' length < seed length %d!\n", read.getName().c_str(), seedLen);
+                       }
  
                         ++cnt;
                         if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
@@ -427,39 +427,39 @@ void PairedEndQModel::finishSimulation() {
  
  
  void PairedEndQModel::calcMW() {
-  assert(seedLen >= OLEN && mld->getMinL() >= seedLen);
-
-  memset(mw, 0, sizeof(double) * (M + 1));
-  mw[0] = 1.0;
-
-  for (int i = 1; i <= M; i++) { 
-    RefSeq& ref = refs->getRef(i);
-    int totLen = ref.getTotLen();
-    int fullLen = ref.getFullLen();
-    int end = std::min(fullLen, totLen - gld->getMinL() + 1);
-    double value = 0.0;
-    int minL, maxL;
-    int effL, pfpos;
-
-    //seedPos is fpos here
-    for (int seedPos = 0; seedPos < end; seedPos++) 
-      if (ref.getMask(seedPos)) {
-       minL = gld->getMinL();
-       maxL = std::min(gld->getMaxL(), totLen - seedPos);
-       pfpos = seedPos;
-       for (int fragLen = minL; fragLen <= maxL; fragLen++) {
-         effL = std::min(fullLen, totLen - fragLen + 1); 
-         value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen); 
-       }
-      }
+       assert(mld->getMinL() >= seedLen);
+
+       memset(mw, 0, sizeof(double) * (M + 1));
+       mw[0] = 1.0;
+
+       for (int i = 1; i <= M; i++) {
+               RefSeq& ref = refs->getRef(i);
+               int totLen = ref.getTotLen();
+               int fullLen = ref.getFullLen();
+               int end = std::min(fullLen, totLen - gld->getMinL() + 1);
+               double value = 0.0;
+               int minL, maxL;
+               int effL, pfpos;
+
+               //seedPos is fpos here
+               for (int seedPos = 0; seedPos < end; seedPos++)
+                       if (ref.getMask(seedPos)) {
+                               minL = gld->getMinL();
+                               maxL = std::min(gld->getMaxL(), totLen - seedPos);
+                               pfpos = seedPos;
+                               for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+                                       effL = std::min(fullLen, totLen - fragLen + 1);
+                                       value += gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen);
+                               }
+                       }
      
-    mw[i] = 1.0 - value;
+               mw[i] = 1.0 - value;
  
-    if (mw[i] < 1e-8) { 
-      //      fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
-      mw[i] = 0.0;
-    }
-  }
+               if (mw[i] < 1e-8) {
+                       //      fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
+                       mw[i] = 0.0;
+               }
+       }
  }
  
  #endif /* PAIREDENDQMODEL_H_ */
diff --git a/PairedEndRead.h b/PairedEndRead.h

index 6504e610141e9db33d027bdb47cff33fb2af95a9..01d3d007a36669ac4292b777f97132ae9c747012 100644 (file)
--- a/PairedEndRead.h
+++ b/PairedEndRead.h
@@ -9,30 +9,28 @@
  #include "SingleRead.h"
  
  class PairedEndRead : public Read {
- public:
-  PairedEndRead() : mate1(), mate2() {}
-  PairedEndRead(const SingleRead& mate1, const SingleRead& mate2) {
-         this->mate1 = mate1;
-         this->mate2 = mate2;
-         this->name = mate1.getName();
-
-         calc_lq();
-  }
-
-  bool read(int argc, std::istream* argv[], int flags = 7);
-  void write(int argc, std::ostream* argv[]);
-
-  const SingleRead& getMate1() const { return mate1; }
-  const SingleRead& getMate2() const { return mate2; }
-  const SingleRead& getMate(int i) const {
-         if (i == 1) return mate1;
-         else return mate2;
-  }
-
- private:
-  SingleRead mate1, mate2;
-
-  void calc_lq();
+public:
+       PairedEndRead() : mate1(), mate2() {}
+       PairedEndRead(const SingleRead& mate1, const SingleRead& mate2) {
+               this->mate1 = mate1;
+               this->mate2 = mate2;
+               this->name = mate1.getName();
+       }
+
+       bool read(int argc, std::istream* argv[], int flags = 7);
+       void write(int argc, std::ostream* argv[]);
+
+       const SingleRead& getMate1() const { return mate1; }
+       const SingleRead& getMate2() const { return mate2; }
+       const SingleRead& getMate(int i) const {
+               if (i == 1) return mate1;
+               else return mate2;
+       }
+
+       void calc_lq(bool, int); // calculate if this read is low quality. Without calling this function, isLowQuality() will always be false
+
+private:
+       SingleRead mate1, mate2;
  };
  
  bool PairedEndRead::read(int argc, std::istream* argv[], int flags) {
@@ -45,8 +43,6 @@ bool PairedEndRead::read(int argc, std::istream* argv[], int flags) {
         name = "";
         if (flags & 4) { name = mate1.getName(); } //May chop 1 char later if we want
  
-       if (flags & 1) calc_lq();
-
         return success;
  }
  
@@ -59,9 +55,13 @@ void PairedEndRead::write(int argc, std::ostream *argv[]) {
         mate2.write(1, outMate2);
  }
  
-void PairedEndRead::calc_lq() {
-       low_quality = mate1.isLowQuality() && mate2.isLowQuality();
-       if (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN) low_quality = true;
+//calculate if this read is low quality
+void PairedEndRead::calc_lq(bool hasPolyA, int seedLen) {
+       low_quality = false;
+       mate1.calc_lq(hasPolyA, seedLen);
+       mate2.calc_lq(hasPolyA, seedLen);
+       if (mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen) low_quality = true;
+       else low_quality = mate1.isLowQuality() && mate2.isLowQuality();
  }
  
  #endif
diff --git a/PairedEndReadQ.h b/PairedEndReadQ.h

index 36b703b059ed4bd4fad7a3eb873c216f49f22e19..751382074b9816134f90cde692507031f598f3fd 100644 (file)
--- a/PairedEndReadQ.h
+++ b/PairedEndReadQ.h
@@ -15,8 +15,6 @@ public:
                 this->mate1 = mate1;
                 this->mate2 = mate2;
                 this->name = mate1.getName();
-
-               calc_lq();
         }
  
         bool read(int argc, std::istream* argv[], int flags = 7);
@@ -29,10 +27,10 @@ public:
                 else return mate2;
         }
  
+       void calc_lq(bool, int); // calculate if this read is low quality. Without calling this function, isLowQuality() will always be false
+
  private:
         SingleReadQ mate1, mate2;
-
-       void calc_lq();
  };
  
  bool PairedEndReadQ::read(int argc, std::istream* argv[], int flags) {
@@ -45,8 +43,6 @@ bool PairedEndReadQ::read(int argc, std::istream* argv[], int flags) {
         name = "";
         if (flags & 4) { name = mate1.getName(); } //May chop 1 char later if we want
  
-       if (flags & 1) calc_lq();
-
         return success;
  }
  
@@ -59,9 +55,13 @@ void PairedEndReadQ::write(int argc, std::ostream* argv[]) {
         mate2.write(1, outMate2);
  }
  
-void PairedEndReadQ::calc_lq() {
-       low_quality = mate1.isLowQuality() && mate2.isLowQuality();
-       if (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN) low_quality = true;
+//calculate if this read is low quality
+void PairedEndReadQ::calc_lq(bool hasPolyA, int seedLen) {
+       low_quality = false;
+       mate1.calc_lq(hasPolyA, seedLen);
+       mate2.calc_lq(hasPolyA, seedLen);
+       if (mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen) low_quality = true;
+       else low_quality = mate1.isLowQuality() && mate2.isLowQuality();
  }
  
  #endif /* PAIREDENDREADQ_H_ */
diff --git a/README.md b/README.md

index 7ef198ce38fbccaa85149267deb89d74ab69f5d3..a4c5ce504a0f284bb594e9a57807a17969a4fc51 100644 (file)
--- a/README.md
+++ b/README.md
@@ -26,9 +26,11 @@ levels from RNA-Seq data.  The new RSEM package (rsem-1.x) provides an
  user-friendly interface, supports threads for parallel computation of
  the EM algorithm, single-end and paired-end read data, quality scores,
  variable-length reads and RSPD estimation. It can also generate
-genomic-coordinate BAM files and UCSC wiggle files for visualization. In
-addition, it provides posterior mean and 95% credibility interval
-estimates for expression levels. 
+genomic-coordinate BAM files and UCSC wiggle files for
+visualization. In addition, it provides posterior mean and 95%
+credibility interval estimates for expression levels. For
+visualization, it can also generate transcript-coordinate BAM files
+and visualize them and also models learned.
  
  ## <a name="compilation"></a> Compilation & Installation
  
@@ -109,18 +111,24 @@ aligner's indices.
  
  ### III. Visualization
  
-RSEM contains a version of samtools in the 'sam' subdirectory. When
-users specify the --out-bam option RSEM will produce three files:
-'sample_name.bam', the unsorted BAM file, 'sample_name.sorted.bam' and
-'sample_name.sorted.bam.bai' the sorted BAM file and indices generated
-by the samtools included.
+RSEM contains a version of samtools in the 'sam' subdirectory. RSEM
+will always produce three files:'sample_name.transcript.bam', the
+unsorted BAM file, 'sample_name.transcript.sorted.bam' and
+'sample_name.transcript.sorted.bam.bai' the sorted BAM file and
+indices generated by the samtools included. All three files are in
+transcript coordinates. When users specify the --output-genome-bam
+option RSEM will produce three files: 'sample_name.genome.bam', the
+unsorted BAM file, 'sample_name.genome.sorted.bam' and
+'sample_name.genome.sorted.bam.bai' the sorted BAM file and indices
+generated by the samtools included. All these files are in genomic
+coordinates.
  
  #### a) Generating a UCSC Wiggle file
  
  A wiggle plot representing the expected number of reads overlapping
-each position in the genome can be generated from the sorted BAM file
-output.  To generate the wiggle plot, run the 'rsem-bam2wig' program on
-the 'sample_name.sorted.bam' file.
+each position in the genome can be generated from the sorted genome
+BAM file output.  To generate the wiggle plot, run the 'rsem-bam2wig'
+program on the 'sample_name.genome.sorted.bam' file.
  
  Usage:    
  
@@ -134,16 +142,26 @@ wiggle_name: the name the user wants to use for this wiggle plot
  
  Refer to the [UCSC custom track help page](http://genome.ucsc.edu/goldenPath/help/customTrack.html).
  
-#### c) Visualize the model learned by RSEM
+#### c) Generating Transcript Wiggle Plots
+
+To generate transcript wiggle plots, you should run the
+'rsem-plot-transcript-wiggles' program.  Run 
+
+    rsem-plot-transcript-wiggles --help
+
+to get usage information or visit the [rsem-plot-transcript-wiggles
+documentation page](http://deweylab.biostat.wisc.edu/rsem/rsem-plot-transcript-wiggles.html).
+
+#### d) Visualize the model learned by RSEM
  
  RSEM provides an R script, 'rsem-plot-model', for visulazing the model learned.
  
  Usage:
      
-    rsem-plot-model sample_name outF
+    rsem-plot-model sample_name output_plot_file
  
  sample_name: the name of the sample analyzed    
-outF: the file name for plots generated from the model. It is a pdf file    
+output_plot_file: the file name for plots generated from the model. It is a pdf file    
  
  The plots generated depends on read type and user configuration. It
  may include fragment length distribution, mate length distribution,
@@ -164,23 +182,28 @@ Histogram of reads with different number of alignments: x-axis is the number of
   
  ## <a name="example"></a> Example
  
-Suppose we download the mouse genome from UCSC Genome Browser.  We will
-use a reference_name of 'mm9'.  We have a FASTQ-formatted file,
-'mmliver.fq', containing single-end reads from one sample, which we call
-'mmliver_single_quals'.  We want to estimate expression values by using
-the single-end model with a fragment length distribution. We know that
-the fragment length distribution is approximated by a normal
-distribution with a mean of 150 and a standard deviation of 35. We wish
-to generate 95% credibility intervals in addition to maximum likelihood
-estimates.  RSEM will be allowed 1G of memory for the credibility
-interval calculation.  We will visualize the probabilistic read mappings
-generated by RSEM.
+Suppose we download the mouse genome from UCSC Genome Browser.  We
+will use a reference_name of 'mm9'.  We have a FASTQ-formatted file,
+'mmliver.fq', containing single-end reads from one sample, which we
+call 'mmliver_single_quals'.  We want to estimate expression values by
+using the single-end model with a fragment length distribution. We
+know that the fragment length distribution is approximated by a normal
+distribution with a mean of 150 and a standard deviation of 35. We
+wish to generate 95% credibility intervals in addition to maximum
+likelihood estimates.  RSEM will be allowed 1G of memory for the
+credibility interval calculation.  We will visualize the probabilistic
+read mappings generated by RSEM on UCSC genome browser. We will
+generate a list of genes' transcript wiggle plots in 'output.pdf'. The
+list is 'gene_ids.txt'. We will visualize the models learned in
+'mmliver_single_quals.models.pdf'
  
  The commands for this scenario are as follows:
  
      rsem-prepare-reference --gtf mm9.gtf --mapping knownIsoforms.txt --bowtie-path /sw/bowtie /data/mm9 /ref/mm9
-    rsem-calculate-expression --bowtie-path /sw/bowtie --phred64-quals --fragment-length-mean 150.0 --fragment-length-sd 35.0 -p 8 --out-bam --calc-ci --memory-allocate 1024 /data/mmliver.fq /ref/mm9 mmliver_single_quals
+    rsem-calculate-expression --bowtie-path /sw/bowtie --phred64-quals --fragment-length-mean 150.0 --fragment-length-sd 35.0 -p 8 --output-genome-bam --calc-ci --memory-allocate 1024 /data/mmliver.fq /ref/mm9 mmliver_single_quals
      rsem-bam2wig mmliver_single_quals.sorted.bam mmliver_single_quals.sorted.wig mmliver_single_quals
+    rsem-plot-transcript-wiggles --gene-list --show-unique mmliver_single_quals gene_ids.txt output.pdf 
+    rsem-plot-model mmliver_single_quals mmliver_single_quals.models.pdf
  
  ## <a name="simulation"></a> Simulation
  
diff --git a/ReadReader.h b/ReadReader.h

index f1bd1f59a8395c5938a034199e7cd48b70d6a9fb..d244efa4236807871dcdda49c4b1f942d224f3ef 100644 (file)
--- a/ReadReader.h
+++ b/ReadReader.h
@@ -19,8 +19,8 @@
  template<class ReadType>
  class ReadReader {
  public:
-       ReadReader() { s = 0; indices = NULL; arr = NULL; }
-       ReadReader(int s, char readFs[][STRLEN]);
+       ReadReader() { s = 0; indices = NULL; arr = NULL; hasPolyA = false; seedLen = -1; }
+       ReadReader(int s, char readFs[][STRLEN], bool hasPolyA = false, int seedLen = -1);
         ~ReadReader();
  
         void setIndices(ReadIndex** indices) {
@@ -31,7 +31,9 @@ public:
         void reset();
  
         bool next(ReadType& read, int flags = 7) {
-               return read.read(s, (std::istream**)arr, flags);
+               bool success = read.read(s, (std::istream**)arr, flags);
+               if (success && seedLen > 0) { read.calc_lq(hasPolyA, seedLen); }
+               return success;
         }
  
  private:
@@ -39,10 +41,13 @@ private:
         ReadIndex **indices;
         std::ifstream** arr;
         std::streampos *locations;
+
+       bool hasPolyA;
+       int seedLen;
  };
  
  template<class ReadType>
-ReadReader<ReadType>::ReadReader(int s, char readFs[][STRLEN]) {
+ReadReader<ReadType>::ReadReader(int s, char readFs[][STRLEN], bool hasPolyA, int seedLen) {
         assert(s > 0);
         this->s = s;
         arr = new std::ifstream*[s];
@@ -53,6 +58,8 @@ ReadReader<ReadType>::ReadReader(int s, char readFs[][STRLEN]) {
                 if (!arr[i]->is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", readFs[i]); exit(-1); }
                 locations[i] = arr[i]->tellg();
         }
+       this->hasPolyA = hasPolyA;
+       this->seedLen = seedLen;
  }
  
  template<class ReadType>
diff --git a/RefSeq.h b/RefSeq.h

index 0e7f2caba0dac2df07b8ad255373d79b9f70de94..88b149e0068e8aa342e14aedf126dac79deac136 100644 (file)
--- a/RefSeq.h
+++ b/RefSeq.h
@@ -10,132 +10,131 @@
  
  //Each Object can only be used once
  class RefSeq {
- public:
-
-  RefSeq() {
-    fullLen = totLen = 0;
-    name = ""; seq = "";
-    fmasks.clear();
+public:
+       RefSeq() {
+               fullLen = totLen = 0;
+               name = ""; seq = "";
+               fmasks.clear();
+       }
+
+       //Constructor , seq : the forward strand of the reference
+       //tag does not contain ">"
+       //polyALen : length of polyA tail we add
+       RefSeq(const std::string& name, const std::string& seq, int polyALen) {
+               fullLen = seq.length();
+               totLen = fullLen + polyALen;
+               this->name = name;
+               this->seq = seq;
+               this->seq.append(polyALen, 'A');
+
+               assert(fullLen > 0 && totLen >= fullLen);
+
+               int len = (fullLen - 1) / NBITS + 1;
+               fmasks.assign(len, 0);
+               // set mask if poly(A) tail is added
+               if (polyALen > 0) {
+                       for (int i = std::max(fullLen - OLEN + 1, 0); i < fullLen; i++) setMask(i);
+               }
    }
  
-  //Constructor , seq : the forward strand of the reference
-  //tag does not contain ">"
-  //polyALen : length of polyA tail we add
-  RefSeq(const std::string& name, const std::string& seq, int polyALen) {
-    fullLen = seq.length();
-    totLen = fullLen + polyALen;
-    this->name = name;
-    this->seq = seq;
-    this->seq.append(polyALen, 'A');
-
-    assert(fullLen > 0 && totLen >= fullLen);
-
-    int len = (fullLen - 1) / NBITS + 1;
-    fmasks.clear(); fmasks.resize(len, 0);
-    // ask read to be at least OLEN long!
-    for (int i = std::max(fullLen - OLEN + 1, 0); i < fullLen; i++) setMask(i);
-  }
+       RefSeq(const RefSeq& o) {
+               fullLen = o.fullLen;
+               totLen = o.totLen;
+               name = o.name;
+               seq = o.seq;
+               fmasks = o.fmasks;
+       }
  
-  RefSeq(const RefSeq& o) {
-    fullLen = o.fullLen;
-    totLen = o.totLen;
-    name = o.name;
-    seq = o.seq;
-    fmasks = o.fmasks;
-  }
+       RefSeq& operator= (const RefSeq &rhs) {
+               if (this != &rhs) {
+                       fullLen = rhs.fullLen;
+                       totLen = rhs.totLen;
+                       name = rhs.name;
+                       seq = rhs.seq;
+                       fmasks = rhs.fmasks;
+               }
  
-  RefSeq& operator= (const RefSeq &rhs) {
-    if (this != &rhs) {
-      fullLen = rhs.fullLen;
-      totLen = rhs.totLen;
-      name = rhs.name;
-      seq = rhs.seq;
-      fmasks = rhs.fmasks;
-    }
+               return *this;
+       }
  
-    return *this;
-  }
+       ~RefSeq() {}
  
-  ~RefSeq() {
-  }
+       bool read(std::ifstream&, int  = 0);
+       void write(std::ofstream&);
  
-  bool read(std::ifstream&, int  = 0);
-  void write(std::ofstream&);
+       int getFullLen() const { return fullLen; }
  
-  int getFullLen() const { return fullLen; }
+       int getTotLen() const { return totLen; }
  
-  int getTotLen() const { return totLen; }
+       const std::string& getName() const { return name; }
  
-  const std::string& getName() const { return name; }
+       std::string getSeq() const { return seq; }
  
-  std::string getSeq() const { return seq; }
-
-  std::string getRSeq() const { 
-    std::string rseq = "";
-    for (int i = totLen - 1; i >= 0; i--) rseq.push_back(getCharacter(get_rbase_id(seq[i])));
-    return rseq;
-  }
-
-  //get the sequence  dir 0 : + 1 : -
-  std::string getSeq(int dir) const {
-    return (dir == 0 ? getSeq() : getRSeq());
-  }
+       std::string getRSeq() const {
+               std::string rseq = "";
+               for (int i = totLen - 1; i >= 0; i--) rseq.push_back(getCharacter(get_rbase_id(seq[i])));
+               return rseq;
+       }
  
+       //get the sequence  dir 0 : + 1 : -
+       std::string getSeq(int dir) const {
+               return (dir == 0 ? getSeq() : getRSeq());
+       }
    
-  int get_id(int pos, int dir) const {
-    assert(pos >= 0 && pos < totLen);
-    return (dir == 0 ? get_base_id(seq[pos]) : get_rbase_id(seq[totLen - pos - 1]));
-  }
-
-  bool getMask(int seedPos) const {
-    assert(seedPos >= 0 && seedPos < totLen);
-    return fmasks[seedPos / NBITS] & mask_codes[seedPos % NBITS];
-  }
-
-  void setMask(int seedPos) { 
-    assert(seedPos >= 0 && seedPos < totLen);
-    fmasks[seedPos / NBITS] |= mask_codes[seedPos % NBITS];
-  }
+       int get_id(int pos, int dir) const {
+               assert(pos >= 0 && pos < totLen);
+               return (dir == 0 ? get_base_id(seq[pos]) : get_rbase_id(seq[totLen - pos - 1]));
+       }
+
+       bool getMask(int seedPos) const {
+               assert(seedPos >= 0 && seedPos < totLen);
+               return fmasks[seedPos / NBITS] & mask_codes[seedPos % NBITS];
+       }
+
+       void setMask(int seedPos) {
+               assert(seedPos >= 0 && seedPos < totLen);
+               fmasks[seedPos / NBITS] |= mask_codes[seedPos % NBITS];
+       }
    
- private:
-  int fullLen; // fullLen : the original length of an isoform
-  int totLen; // totLen : the total length, included polyA tails, if any
-  std::string name; // the tag
-  std::string seq; // the raw sequence, in forward strand
-  std::vector<unsigned int> fmasks; // record masks for forward strand, each position occupies 1 bit 
+private:
+       int fullLen; // fullLen : the original length of an isoform
+       int totLen; // totLen : the total length, included polyA tails, if any
+       std::string name; // the tag
+       std::string seq; // the raw sequence, in forward strand
+       std::vector<unsigned int> fmasks; // record masks for forward strand, each position occupies 1 bit
  };
  
  //internal read; option 0 : read all 1 : do not read seqences
  bool RefSeq::read(std::ifstream& fin, int option) {
-  std::string line;
+       std::string line;
  
-  if (!(fin>>fullLen>>totLen)) return false;
-  assert(fullLen > 0 && totLen >= fullLen);
-  getline(fin, line);
-  if (!getline(fin, name)) return false;
-  if (!getline(fin, seq)) return false;
-  
-  int len = (fullLen - 1) / NBITS + 1; // assume each cell contains NBITS bits
-  fmasks.resize(len, 0); 
-  for (int i = 0; i < len; i++) 
-    if (!(fin>>fmasks[i])) return false;
-  getline(fin, line);
+       if (!(fin>>fullLen>>totLen)) return false;
+       assert(fullLen > 0 && totLen >= fullLen);
+       getline(fin, line);
+       if (!getline(fin, name)) return false;
+       if (!getline(fin, seq)) return false;
+
+       int len = (fullLen - 1) / NBITS + 1; // assume each cell contains NBITS bits
+       fmasks.assign(len, 0);
+       for (int i = 0; i < len; i++)
+           if (!(fin>>fmasks[i])) return false;
+       getline(fin, line);
  
-  assert(option == 0 || option == 1);
-  if (option == 1) { seq = ""; }
+       assert(option == 0 || option == 1);
+       if (option == 1) { seq = ""; }
  
-  return true;
+       return true;
  }
  
  //write to file in "internal" format
  void RefSeq::write(std::ofstream& fout) {
-  fout<<fullLen<<" "<<totLen<<std::endl;
-  fout<<name<<std::endl;
-  fout<<seq<<std::endl;
-  
-  int len = fmasks.size();
-  for (int i = 0; i < len - 1; i++) fout<<fmasks[i]<<" ";
-  fout<<fmasks[len - 1]<<std::endl;
+       fout<<fullLen<<" "<<totLen<<std::endl;
+       fout<<name<<std::endl;
+       fout<<seq<<std::endl;
+
+       int len = fmasks.size();
+       for (int i = 0; i < len - 1; i++) fout<<fmasks[i]<<" ";
+       fout<<fmasks[len - 1]<<std::endl;
  }
  
  #endif
diff --git a/Refs.h b/Refs.h

index 67c5d57a349ab589a479b6ac0608058e86cb973e..008d2755e3aa56862566363523028f330de69c5b 100644 (file)
--- a/Refs.h
+++ b/Refs.h
@@ -20,6 +20,7 @@ class Refs {
    Refs() {
      M = 0;
      seqs.clear();
+    has_polyA = false;
    }
  
    ~Refs() {}
@@ -36,6 +37,8 @@ class Refs {
  
    std::vector<RefSeq>& getRefs() { return seqs; } // may be slow, for copying the whole thing
  
+  bool hasPolyA() { return has_polyA; } // if any of sequence has poly(A) tail added
+
    //lim : >=0 If mismatch > lim , return; -1 find all mismatches
    int countMismatch(const std::string& seq, int pos, const std::string& readseq, int LEN, int lim = -1) {
      int nMis = 0; // number of mismatches
@@ -73,7 +76,7 @@ class Refs {
   private:
    int M; // # of isoforms, id starts from 1
    std::vector<RefSeq> seqs;  // reference sequences, starts from 1; 0 is for noise gene
-
+  bool has_polyA; // if at least one sequence has polyA added, the value is true; otherwise, the value is false
  };
  
  //inpF in fasta format
@@ -87,6 +90,7 @@ void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) {
    seqs.push_back(RefSeq()); // noise isoform
  
    M = 0;
+  has_polyA = false;
  
    fin.open(inpF);
    if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", inpF); exit(-1); }
@@ -103,6 +107,7 @@ void Refs::makeRefs(char *inpF, RefSeqPolicy& policy, PolyARules& rules) {
      }
      ++M;
      seqs.push_back(RefSeq(tag, policy.convert(rawseq), rules.getLenAt(tag)));
+    has_polyA = has_polyA || seqs[M].getFullLen() < seqs[M].getTotLen();
    }
    fin.close();
  
@@ -121,6 +126,7 @@ void Refs::loadRefs(char *inpF, int option) {
    seqs.push_back(RefSeq());
  
    M = 0;
+  has_polyA = false;
  
    bool success;
    do {
@@ -128,6 +134,7 @@ void Refs::loadRefs(char *inpF, int option) {
      if (success) {
         seqs.push_back(seq);
          ++M;
+       has_polyA = has_polyA || seq.getFullLen() < seq.getTotLen();
      }
    } while (success);
  
diff --git a/SingleModel.h b/SingleModel.h

index 97ffd29c8ce9b7d499d99979bf0d51546cb17ab7..59db6ec69d30ce228535312f9cd91da476be2cae 100644 (file)
--- a/SingleModel.h
+++ b/SingleModel.h
@@ -269,15 +269,17 @@ void SingleModel::estimateFromReads(const char* readFN) {
         for (int i = 0; i < 3; i++)
                 if (N[i] > 0) {
                         genReadFileNames(readFN, i, read_type, s, readFs);
-                       ReadReader<SingleRead> reader(s, readFs);
+                       ReadReader<SingleRead> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
  
                         int cnt = 0;
                         while (reader.next(read)) {
                                 if (!read.isLowQuality()) {
-                                 mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
-                                 if (i == 0) { npro->updateC(read.getReadSeq()); }
+                                       mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
+                                       if (i == 0) { npro->updateC(read.getReadSeq()); }
+                               }
+                               else if (verbose && read.getReadLength() < seedLen) {
+                                       printf("Warning: Read %s is ignored due to read length %d < seed length %d!\n", read.getName().c_str(), read.getReadLength(), seedLen);
                                 }
-                               else if (verbose && read.getReadLength() < OLEN) { printf("Warning: Read %s is ignored due to read length < %d!\n", read.getName().c_str(), OLEN); }
  
                                 ++cnt;
                                 if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
@@ -443,68 +445,67 @@ void SingleModel::finishSimulation() {
  }
  
  void SingleModel::calcMW() {
-  double probF, probR;
+       double probF, probR;
  
-  assert(seedLen >= OLEN && (mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
-  
-  memset(mw, 0, sizeof(double) * (M + 1));
-  mw[0] = 1.0;
-  
+       assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
    
-  probF = ori->getProb(0);
-  probR = ori->getProb(1);
-  
-  for (int i = 1; i <= M; i++) { 
-    RefSeq& ref = refs->getRef(i);
-    int totLen = ref.getTotLen();
-    int fullLen = ref.getFullLen();
-    double value = 0.0;
-    int minL, maxL;
-    int effL, pfpos;
-    int end = std::min(fullLen, totLen - seedLen + 1);
-    double factor;
-
-    for (int seedPos = 0; seedPos < end; seedPos++) 
-      if (ref.getMask(seedPos)) {
-       //forward
-       minL = gld->getMinL();
-       maxL = std::min(gld->getMaxL(), totLen - seedPos);
-       pfpos = seedPos;
-       for (int fragLen = minL; fragLen <= maxL; fragLen++) {
-         effL = std::min(fullLen, totLen - fragLen + 1); 
-         factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); 
-         value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; 
-       }
-       //reverse
-       minL = gld->getMinL();
-       maxL = std::min(gld->getMaxL(), seedPos + seedLen);
-       for (int fragLen = minL; fragLen <= maxL; fragLen++) {
-         pfpos = seedPos - (fragLen - seedLen);
-         effL = std::min(fullLen, totLen - fragLen + 1);
-         factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); 
-         value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
-       }
-      }
-    
-    //for reverse strand masking
-    for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
-      minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
-      maxL = std::min(gld->getMaxL(), seedPos + seedLen);
-      for (int fragLen = minL; fragLen <= maxL; fragLen++) {
-       pfpos = seedPos - (fragLen - seedLen);
-       effL = std::min(fullLen, totLen - fragLen + 1);
-       factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); 
-       value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
-      }
-    }
+       memset(mw, 0, sizeof(double) * (M + 1));
+       mw[0] = 1.0;
+
+       probF = ori->getProb(0);
+       probR = ori->getProb(1);
+
+       for (int i = 1; i <= M; i++) {
+               RefSeq& ref = refs->getRef(i);
+               int totLen = ref.getTotLen();
+               int fullLen = ref.getFullLen();
+               double value = 0.0;
+               int minL, maxL;
+               int effL, pfpos;
+               int end = std::min(fullLen, totLen - seedLen + 1);
+               double factor;
+
+               for (int seedPos = 0; seedPos < end; seedPos++)
+                       if (ref.getMask(seedPos)) {
+                               //forward
+                               minL = gld->getMinL();
+                               maxL = std::min(gld->getMaxL(), totLen - seedPos);
+                               pfpos = seedPos;
+                               for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+                                       effL = std::min(fullLen, totLen - fragLen + 1);
+                                       factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
+                                       value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+                               }
+                               //reverse
+                               minL = gld->getMinL();
+                               maxL = std::min(gld->getMaxL(), seedPos + seedLen);
+                               for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+                                       pfpos = seedPos - (fragLen - seedLen);
+                                       effL = std::min(fullLen, totLen - fragLen + 1);
+                                       factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
+                                       value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+                               }
+                       }
      
-    mw[i] = 1.0 - value;
+               //for reverse strand masking
+               for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
+                       minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
+                       maxL = std::min(gld->getMaxL(), seedPos + seedLen);
+                       for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+                               pfpos = seedPos - (fragLen - seedLen);
+                               effL = std::min(fullLen, totLen - fragLen + 1);
+                               factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
+                               value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+                       }
+               }
+
+               mw[i] = 1.0 - value;
  
-    if (mw[i] < 1e-8) { 
-      //      fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
-      mw[i] = 0.0;
-    }
-  }
+               if (mw[i] < 1e-8) {
+                       //      fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
+                       mw[i] = 0.0;
+               }
+       }
  }
  
  #endif /* SINGLEMODEL_H_ */
diff --git a/SingleQModel.h b/SingleQModel.h

index 5d4191a809d39c454e1bfea7805b12f5fde990c0..786d6476383ec41fb06606f74c49099786f693e7 100644 (file)
--- a/SingleQModel.h
+++ b/SingleQModel.h
@@ -279,16 +279,18 @@ void SingleQModel::estimateFromReads(const char* readFN) {
         for (int i = 0; i < 3; i++)
                 if (N[i] > 0) {
                         genReadFileNames(readFN, i, read_type, s, readFs);
-                       ReadReader<SingleReadQ> reader(s, readFs);
+                       ReadReader<SingleReadQ> reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function
  
                         int cnt = 0;
                         while (reader.next(read)) {
                                 if (!read.isLowQuality()) {
-                                 mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
-                                 qd->update(read.getQScore());
-                                 if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
+                                       mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
+                                       qd->update(read.getQScore());
+                                       if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
+                               }
+                               else if (verbose && read.getReadLength() < seedLen) {
+                                       printf("Warning: Read %s is ignored due to read length %d < seed length %d!\n", read.getName().c_str(), read.getReadLength(), seedLen);
                                 }
-                               else if (verbose && read.getReadLength() < OLEN) { printf("Warning: Read %s is ignored due to read length < %d!\n", read.getName().c_str(), OLEN); }
  
                                 ++cnt;
                                 if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
@@ -464,67 +466,67 @@ void SingleQModel::finishSimulation() {
  }
  
  void SingleQModel::calcMW() {
-  double probF, probR;
-
-  assert(seedLen >= OLEN && (mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
-
-  memset(mw, 0, sizeof(double) * (M + 1));
-  mw[0] = 1.0;
-
-  probF = ori->getProb(0);
-  probR = ori->getProb(1);
-
-  for (int i = 1; i <= M; i++) { 
-    RefSeq& ref = refs->getRef(i);
-    int totLen = ref.getTotLen();
-    int fullLen = ref.getFullLen();
-    double value = 0.0;
-    int minL, maxL;
-    int effL, pfpos;
-    int end = std::min(fullLen, totLen - seedLen + 1);
-    double factor;
-
-    for (int seedPos = 0; seedPos < end; seedPos++) 
-      if (ref.getMask(seedPos)) {
-       //forward
-       minL = gld->getMinL();
-       maxL = std::min(gld->getMaxL(), totLen - seedPos);
-       pfpos = seedPos;
-       for (int fragLen = minL; fragLen <= maxL; fragLen++) {
-         effL = std::min(fullLen, totLen - fragLen + 1); 
-         factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); 
-         value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor; 
-       }
-       //reverse
-       minL = gld->getMinL();
-       maxL = std::min(gld->getMaxL(), seedPos + seedLen);
-       for (int fragLen = minL; fragLen <= maxL; fragLen++) {
-         pfpos = seedPos - (fragLen - seedLen);
-         effL = std::min(fullLen, totLen - fragLen + 1);
-         factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); 
-         value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+       double probF, probR;
+
+       assert((mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
+
+       memset(mw, 0, sizeof(double) * (M + 1));
+       mw[0] = 1.0;
+
+       probF = ori->getProb(0);
+       probR = ori->getProb(1);
+
+       for (int i = 1; i <= M; i++) {
+               RefSeq& ref = refs->getRef(i);
+               int totLen = ref.getTotLen();
+               int fullLen = ref.getFullLen();
+               double value = 0.0;
+               int minL, maxL;
+               int effL, pfpos;
+               int end = std::min(fullLen, totLen - seedLen + 1);
+               double factor;
+
+               for (int seedPos = 0; seedPos < end; seedPos++)
+                       if (ref.getMask(seedPos)) {
+                               //forward
+                               minL = gld->getMinL();
+                               maxL = std::min(gld->getMaxL(), totLen - seedPos);
+                               pfpos = seedPos;
+                               for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+                                       effL = std::min(fullLen, totLen - fragLen + 1);
+                                       factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
+                                       value += probF * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+                               }
+                               //reverse
+                               minL = gld->getMinL();
+                               maxL = std::min(gld->getMaxL(), seedPos + seedLen);
+                               for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+                                       pfpos = seedPos - (fragLen - seedLen);
+                                       effL = std::min(fullLen, totLen - fragLen + 1);
+                                       factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
+                                       value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+                               }
+                       }
+
+               //for reverse strand masking
+               for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
+                       minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
+                       maxL = std::min(gld->getMaxL(), seedPos + seedLen);
+                       for (int fragLen = minL; fragLen <= maxL; fragLen++) {
+                               pfpos = seedPos - (fragLen - seedLen);
+                               effL = std::min(fullLen, totLen - fragLen + 1);
+                               factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen));
+                               value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
+                       }
+               }
+
+               mw[i] = 1.0 - value;
+
+               if (mw[i] < 1e-8) {
+                       //      fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
+                       mw[i] = 0.0;
+               }
         }
-      }
-    
-    //for reverse strand masking
-    for (int seedPos = end; seedPos <= totLen - seedLen; seedPos++) {
-      minL = std::max(gld->getMinL(), seedPos + seedLen - fullLen + 1);
-      maxL = std::min(gld->getMaxL(), seedPos + seedLen);
-      for (int fragLen = minL; fragLen <= maxL; fragLen++) {
-       pfpos = seedPos - (fragLen - seedLen);
-       effL = std::min(fullLen, totLen - fragLen + 1);
-       factor = (mld == NULL ? 1.0 : mld->getAdjustedCumulativeProb(std::min(mld->getMaxL(), fragLen), fragLen)); 
-       value += probR * gld->getAdjustedProb(fragLen, totLen) * rspd->getAdjustedProb(pfpos, effL, fullLen) * factor;
-      }
-    }
-    
-    mw[i] = 1.0 - value;
-
-    if (mw[i] < 1e-8) { 
-      //      fprintf(stderr, "Warning: %dth reference sequence is masked for almost all positions!\n", i);
-      mw[i] = 0.0;
-    }
-  }
  }
  
  #endif /* SINGLEQMODEL_H_ */
diff --git a/SingleRead.h b/SingleRead.h

index 3c61242202ab3f4bfaad173787767417b04858f3..8ea4eec510e506ea9e1728e21766d96969e2b308 100644 (file)
--- a/SingleRead.h
+++ b/SingleRead.h
@@ -12,77 +12,81 @@
  #include "Read.h"
  
  class SingleRead : public Read {
- public:
-  SingleRead() { readseq = ""; len = 0; }
-  SingleRead(const std::string& name, const std::string& readseq) {
-         this->name = name;
-         this->readseq = readseq;
-         this->len = readseq.length();
-         calc_lq();
-  }
-
-  bool read(int argc, std::istream* argv[], int flags = 7);
-  void write(int argc, std::ostream* argv[]);
-
-  const int getReadLength() const { return len; /*readseq.length();*/ } // If need memory and .length() are guaranteed O(1), use statement in /* */
-  const std::string& getReadSeq() const { return readseq; }
-
- private:
-  int len; // read length
-  std::string readseq; // read sequence
-
-  void calc_lq();
-};
+public:
+       SingleRead() { readseq = ""; len = 0; }
+       SingleRead(const std::string& name, const std::string& readseq) {
+               this->name = name;
+               this->readseq = readseq;
+               this->len = readseq.length();
+       }
  
-//If return false, you should not trust the value of any member
-bool SingleRead::read(int argc, std::istream* argv[], int flags) {
-  std::string line;
+       bool read(int argc, std::istream* argv[], int flags = 7);
+       void write(int argc, std::ostream* argv[]);
  
-  assert(argc == 1);
-  if (!getline((*argv[0]), line)) return false;
-  if (line[0] != '>') { fprintf(stderr, "Read file does not look like a FASTA file!"); exit(-1); }
-  name = "";
-  if (flags & 4) { name = line.substr(1); }
-  if (!getline((*argv[0]), readseq)) return false;
-  len = readseq.length(); // set read length
-  if (!(flags & 1)) { readseq = ""; }
+       const int getReadLength() const { return len; /*readseq.length();*/ } // If need memory and .length() are guaranteed O(1), use statement in /* */
+       const std::string& getReadSeq() const { return readseq; }
  
-  if (flags & 1) calc_lq();
+       void calc_lq(bool, int); // calculate if this read is low quality. Without calling this function, isLowQuality() will always be false
  
-  return true;
+private:
+       int len; // read length
+       std::string readseq; // read sequence
+};
+
+//If return false, you should not trust the value of any member
+bool SingleRead::read(int argc, std::istream* argv[], int flags) {
+       std::string line;
+
+       assert(argc == 1);
+       if (!getline((*argv[0]), line)) return false;
+       if (line[0] != '>') { fprintf(stderr, "Read file does not look like a FASTA file!"); exit(-1); }
+       name = "";
+       if (flags & 4) { name = line.substr(1); }
+       if (!getline((*argv[0]), readseq)) return false;
+       len = readseq.length(); // set read length
+       if (!(flags & 1)) { readseq = ""; }
+
+       return true;
  }
  
  void SingleRead::write(int argc, std::ostream* argv[]) {
-  assert(argc == 1);
-  (*argv[0])<<">"<<name<<std::endl<<readseq<<std::endl;
+       assert(argc == 1);
+       (*argv[0])<<">"<<name<<std::endl<<readseq<<std::endl;
  }
  
-void SingleRead::calc_lq() {
-  int numA = 0, numT = 0, numAO = 0, numTO = 0; // numAO : number of A in overlap seed region
-  int threshold_1, threshold_2;
-
-  if (len < OLEN) { low_quality = true; return; }
-
-  threshold_1 = int(0.9 * len - 1.5 * sqrt(len * 1.0) + 0.5);
-  threshold_2 = (OLEN - 1) / 2 + 1;
-  for (int i = 0; i < len; i++) {
-    if (readseq[i] == 'A') { 
-      ++numA;
-      if (i < OLEN) ++numAO;
-    }
-    if (readseq[i] == 'T') {
-      ++numT;
-      if (i >= len - OLEN) ++numTO; 
-    }
-  }
+//calculate if this read is low quality
+void SingleRead::calc_lq(bool hasPolyA, int seedLen) {
+       low_quality = false;
+       if (len < seedLen) { low_quality = true; return; }
+
+       // if no polyA, no need to do the following calculation
+       if (!hasPolyA) return;
+
+       assert(readseq != "");
+
+       int numA = 0, numT = 0, numAO = 0, numTO = 0; // numAO : number of A in overlap seed region
+       int threshold_1, threshold_2;
+
+       threshold_1 = int(0.9 * len - 1.5 * sqrt(len * 1.0) + 0.5);
+       threshold_2 = (OLEN - 1) / 2 + 1;
+       for (int i = 0; i < len; i++) {
+               if (readseq[i] == 'A') {
+                       ++numA;
+                       if (i < OLEN) ++numAO;
+               }
+               if (readseq[i] == 'T') {
+                       ++numT;
+                       if (i >= len - OLEN) ++numTO;
+               }
+       }
    
-  if (numA >= threshold_1) {
-    low_quality = (numAO >= threshold_2);
-  }
-  else if (numT >= threshold_1) {
-    low_quality = (numTO >= threshold_2);
-  }
-  else low_quality = false;
+       if (numA >= threshold_1) {
+               low_quality = (numAO >= threshold_2);
+       }
+       else if (numT >= threshold_1) {
+               low_quality = (numTO >= threshold_2);
+       }
+       else low_quality = false;
  }
  
  #endif
diff --git a/SingleReadQ.h b/SingleReadQ.h

index 976637fc269d79fa484cb0033421e7f7a0769c49..8e7d898bdf8c0810c26f17a0403f0e2603463d4e 100644 (file)
--- a/SingleReadQ.h
+++ b/SingleReadQ.h
@@ -12,50 +12,46 @@
  #include "Read.h"
  
  class SingleReadQ : public Read {
- public:
-  SingleReadQ() { readseq = qscore = ""; len = 0; }
-  SingleReadQ(const std::string& name, const std::string& readseq, const std::string& qscore) {
-         this->name = name;
-         this->readseq = readseq;
-         this->qscore = qscore;
-         this->len = readseq.length();
-
-         calc_lq();
-  }
-
-  bool read(int argc, std::istream* argv[], int flags = 7);
-  void write(int argc, std::ostream* argv[]);
-
-  int getReadLength() const { return len; }
-  const std::string& getReadSeq() const { return readseq; }
-  const std::string& getQScore() const { return qscore; }
-
- private:
-  int len; // read length
-  std::string readseq, qscore; // qscore : quality scores
-
-  void calc_lq();
+public:
+       SingleReadQ() { readseq = qscore = ""; len = 0; }
+       SingleReadQ(const std::string& name, const std::string& readseq, const std::string& qscore) {
+               this->name = name;
+               this->readseq = readseq;
+               this->qscore = qscore;
+               this->len = readseq.length();
+       }
+
+       bool read(int argc, std::istream* argv[], int flags = 7);
+       void write(int argc, std::ostream* argv[]);
+
+       int getReadLength() const { return len; }
+       const std::string& getReadSeq() const { return readseq; }
+       const std::string& getQScore() const { return qscore; }
+
+       void calc_lq(bool, int); // calculate if this read is low quality. Without calling this function, isLowQuality() will always be false
+
+private:
+       int len; // read length
+       std::string readseq, qscore; // qscore : quality scores
  };
  
  bool SingleReadQ::read(int argc, std::istream* argv[], int flags) {
-  std::string line;
-
-  assert(argc == 1);
-  if (!getline((*argv[0]), line)) return false;
-  if (line[0] != '@') { fprintf(stderr, "Read file does not look like a FASTQ file!\n"); exit(-1); }
-  name = "";
-  if (flags & 4) { name = line.substr(1); }
-  if (!getline((*argv[0]), readseq)) return false;
-  len = readseq.length();
-  if (!(flags & 1)) { readseq = ""; }
-  if (!getline((*argv[0]), line)) return false;
-  if (line[0] != '+') { fprintf(stderr, "Read file does not look like a FASTQ file!\n"); exit(-1); }
-  if (!getline((*argv[0]), qscore)) return false;
-  if (!(flags & 2)) { qscore = ""; }
-
-  if (flags & 1) calc_lq();
-
-  return true;
+       std::string line;
+
+       assert(argc == 1);
+       if (!getline((*argv[0]), line)) return false;
+       if (line[0] != '@') { fprintf(stderr, "Read file does not look like a FASTQ file!\n"); exit(-1); }
+       name = "";
+       if (flags & 4) { name = line.substr(1); }
+       if (!getline((*argv[0]), readseq)) return false;
+       len = readseq.length();
+       if (!(flags & 1)) { readseq = ""; }
+       if (!getline((*argv[0]), line)) return false;
+       if (line[0] != '+') { fprintf(stderr, "Read file does not look like a FASTQ file!\n"); exit(-1); }
+       if (!getline((*argv[0]), qscore)) return false;
+       if (!(flags & 2)) { qscore = ""; }
+
+       return true;
  }
  
  void SingleReadQ::write(int argc, std::ostream* argv[]) {
@@ -63,32 +59,40 @@ void SingleReadQ::write(int argc, std::ostream* argv[]) {
         (*argv[0])<<"@"<<name<<std::endl<<readseq<<std::endl<<"+\n"<<qscore<<std::endl;
  }
  
-void SingleReadQ::calc_lq() {
-  int numA = 0, numT = 0, numAO = 0, numTO = 0; // numAO : number of A in overlap seed region
-  int threshold_1, threshold_2;
-
-  if (len < OLEN) { low_quality = true; return; }
-
-  threshold_1 = int(0.9 * len - 1.5 * sqrt(len * 1.0) + 0.5);
-  threshold_2 = (OLEN - 1) / 2 + 1;
-  for (int i = 0; i < len; i++) {
-    if (readseq[i] == 'A') { 
-      ++numA;
-      if (i < OLEN) ++numAO;
-    }
-    if (readseq[i] == 'T') {
-      ++numT;
-      if (i >= len - OLEN) ++numTO; 
-    }
-  }
-  
-  if (numA >= threshold_1) {
-    low_quality = (numAO >= threshold_2);
-  }
-  else if (numT >= threshold_1) {
-    low_quality = (numTO >= threshold_2);
-  }
-  else low_quality = false;
+//calculate if this read is low quality
+void SingleReadQ::calc_lq(bool hasPolyA, int seedLen) {
+       low_quality = false;
+       if (len < seedLen) { low_quality = true; return; }
+
+       // if no polyA, no need to do the following calculation
+       if (!hasPolyA) return;
+
+       assert(readseq != "");
+
+       int numA = 0, numT = 0, numAO = 0, numTO = 0; // numAO : number of A in overlap seed region
+       int threshold_1, threshold_2;
+
+
+       threshold_1 = int(0.9 * len - 1.5 * sqrt(len * 1.0) + 0.5);
+       threshold_2 = (OLEN - 1) / 2 + 1;
+       for (int i = 0; i < len; i++) {
+               if (readseq[i] == 'A') {
+                       ++numA;
+                       if (i < OLEN) ++numAO;
+               }
+               if (readseq[i] == 'T') {
+                       ++numT;
+                       if (i >= len - OLEN) ++numTO;
+               }
+       }
+
+       if (numA >= threshold_1) {
+               low_quality = (numAO >= threshold_2);
+       }
+       else if (numT >= threshold_1) {
+               low_quality = (numTO >= threshold_2);
+       }
+       else low_quality = false;
  }
  
  #endif
diff --git a/rsem-calculate-expression b/rsem-calculate-expression

index 774065a33aced3b9003285be20c52b3cc7017832..cbfef53fd7da453c07c29a5830691933aa8be1fc 100755 (executable)
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -113,9 +113,11 @@ pod2usage(-msg => "Min fragment length should be at least 1!", -exitval => 2, -v
  pod2usage(-msg => "Min fragment length should be smaller or equal to max fragment length!", -exitval => 2, -verbose => 2) if ($minL > $maxL);
  pod2usage(-msg => "The memory allocated for calculating credibility intervals should be at least 1 MB!\n", -exitval => 2, -verbose => 2) if ($NMB < 1);
  pod2usage(-msg => "Number of threads should be at least 1!\n", -exitval => 2, -verbose => 2) if ($nThreads < 1);
-pod2usage(-msg => "Seed length should be at least 25!\n", -exitval => 2, -verbose => 2) if ($L < 25);
+pod2usage(-msg => "Seed length should be at least 5!\n", -exitval => 2, -verbose => 2) if ($L < 5);
  pod2usage(-msg => "--sampling-for-bam cannot be specified if --out-bam is not specified!\n", -exitval => 2, -verbose => 2) if ($sampling && !$genBamF);
  
+if ($L < 25) { print "Warning: the seed length set is less than 25! This is only allowed if the references are not added poly(A) tails.\n"; }
+
  if ($strand_specific) { $probF = 1.0; }
  
  my $mate1_list = "";
@@ -472,7 +474,7 @@ Calculate 95% credibility intervals and posterior mean estimates.  (Default: off
  
  =item B<--seed-length> <int>
  
-Seed length used by the read aligner.  Providing the correct value for this parameter is important for RSEM's accuracy if the data are single-end reads.  If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. The minimum value is 25. Any read with its or at least one of its mates' (for paired-end reads) length less than 25 will be ignored. (Default: 25)
+Seed length used by the read aligner.  Providing the correct value is important for RSEM. If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. Any read with its or at least one of its mates' (for paired-end reads) length less than this value will be ignored. If the references are not added poly(A) tails, the minimum allowed value is 5, otherwise, the minimum allowed value is 25. Note that this script will only check if the value >= 5 and give a warning message if the value < 25 but >= 5. (Default: 25)
  
  =item B<--tag> <string>
  
@@ -560,7 +562,7 @@ One simple way to make the alignment file (e.g. input.sam) satisfying RSEM's req
  
    sort -k 1,1 -s input.sam > input.sorted.sam
  
-The SAM/BAM format RSEM uses is v1.4. However, it is compatible with old SAM/BAM format. 
+The SAM/BAM format RSEM uses is v1.4. However, it is compatible with old SAM/BAM format. However, RSEM cannot recognize 0x100 in the FLAG field. In addition, RSEM requires SEQ and QUAL not be '*'. 
  
  The user must run 'rsem-prepare-reference' with the appropriate reference before using this program.
  
@@ -697,4 +699,3 @@ Assume the path to the bowtie executables is in the user's PATH environment vari
                             mmliver_paired_end_quals
  
  =cut
-
diff --git a/rsem-plot-transcript-wiggles b/rsem-plot-transcript-wiggles

index e0e83c0a389632750b47627b7e5e0764dcde0edc..5054f5bc0d4fa3256bc4ac4899162c4b4e570847 100755 (executable)
--- a/rsem-plot-transcript-wiggles
+++ b/rsem-plot-transcript-wiggles
@@ -80,11 +80,11 @@ The name of the sample analyzed.
  
  =item B<input_list>
  
-A list of transcript ids or gene ids. But it cannot be a mixture of transcript & gene ids.
+A list of transcript ids or gene ids. But it cannot be a mixture of transcript & gene ids. Each id occupies one line without extra spaces.
  
  =item B<output_plot_file>
  
-The file name for the plots.
+The file name of the pdf file which contains all plots.
  
  =back
  
@@ -98,7 +98,7 @@ The input-list is a list of gene ids. (Default: off)
  
  =item B<--show-unique>
  
-Show unique wiggle. (Default: off)
+Show the wiggle plots as stacked bar plots. See description section for details. (Default: off)
  
  =item B<-h/--help>
  
@@ -108,14 +108,36 @@ Show help information.
  
  =head1 DESCRIPTION
  
-blablabla
+This program generates transcript wiggle plots and outputs them in a pdf file. This program can accept either a list of transcript ids or gene ids (if transcript to gene mapping information is provided) and has two modes of showing wiggle plots. If '--show-unique' is not specified, the wiggle plot for each transcript is a histogram where each position has the expected read depth at this position as its height. If '--show-unique' is specified, for each transcript a stacked bar plot is generated. For each position, the read depth of unique reads, which have only one alignment, is showed in black. The read depth of multi-reads, which align to more than one places, is showed in red on top of the read depth of unique reads.This program will use some files RSEM generated previouslly. So please do not delete/move any file 'rsem-calculate-expression' generated.
  
  =head1 OUTPUT
  
-blablabla
+=over
+
+=item B<output_plot_file>
+
+This is a pdf file containing all plots generated. If a list of transcript ids is provided, each page display at most 6 plots in 3 rows and 2 columns. If gene ids are provided, each page display a gene. The gene's id is showed at the top and all its transcripts' wiggle plots are showed in this page. The arrangment of plots is determined automatically. For each transcript wiggle plot, the transcript id is displayed as title. x-axis is position in the transcript and y-axis is read depth. 
+
+=item B<sample_name.transcript.sorted.bam and sample_name.transcript.readdepth>
+
+If these files do not exist, 'rsem-plot-transcript-wiggles' will automatically generate them.
+
+=item B<sample_name.uniq.transcript.bam, sample_name.uniq.transcript.sorted.bam and sample_name.uniq.transcript.readdepth>
+
+If '--show-unique' option is specified and these files do not exist, 'rsem-plot-transcript-wiggles' will automatically generate them. 
+
+=back
  
  =head1 EXAMPLES
  
-blablabla
+Suppose sample_name and output_plot_file are set to 'mmliver_single_quals' and 'output.pdf' respectively. input_list is set to 'transcript_ids.txt' if transcript ids are provided, and is set to 'gene_ids.txt' if gene ids are provided.
+
+1) Transcript ids are provided and we just want normal wiggle plots:
+
+ rsem-plot-transcript-wiggles mmliver_single_quals transcript_ids.txt output.pdf
+
+2) Gene ids are provided and we want to show stacked bar plots:
+
+ rsem-plot-transcript-wiggles --gene-list --show-unique mmliver_single_quals gene_ids.txt output.pdf 
  
  =cut
diff --git a/sam/bam2depth.c b/sam/bam2depth.c

new file mode 100644 (file)

index 0000000..ca36b89
--- /dev/null
+++ b/sam/bam2depth.c
@@ -0,0 +1,112 @@
+/* This program demonstrates how to generate pileup from multiple BAMs
+ * simutaneously, to achieve random access and to use the BED interface.
+ * To compile this program separately, you may:
+ *
+ *   gcc -g -O2 -Wall -o bam2depth -D_MAIN_BAM2DEPTH bam2depth.c -L. -lbam -lz
+ */
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+#include "bam.h"
+
+typedef struct {     // auxiliary data structure
+       bamFile fp;      // the file handler
+       bam_iter_t iter; // NULL if a region not specified
+       int min_mapQ;    // mapQ filter
+} aux_t;
+
+void *bed_read(const char *fn); // read a BED or position list file
+void bed_destroy(void *_h);     // destroy the BED data structure
+int bed_overlap(const void *_h, const char *chr, int beg, int end); // test if chr:beg-end overlaps
+
+// This function reads a BAM alignment from one BAM file.
+static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup
+{
+       aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure
+       int ret = aux->iter? bam_iter_read(aux->fp, aux->iter, b) : bam_read1(aux->fp, b);
+       if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP;
+       return ret;
+}
+
+#ifdef _MAIN_BAM2DEPTH
+int main(int argc, char *argv[])
+#else
+int main_depth(int argc, char *argv[])
+#endif
+{
+       int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0;
+       const bam_pileup1_t **plp;
+       char *reg = 0; // specified region
+       void *bed = 0; // BED data structure
+       bam_header_t *h = 0; // BAM header of the 1st input
+       aux_t **data;
+       bam_mplp_t mplp;
+
+       // parse the command line
+       while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) {
+               switch (n) {
+                       case 'r': reg = strdup(optarg); break;   // parsing a region requires a BAM header
+                       case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now
+                       case 'q': baseQ = atoi(optarg); break;   // base quality threshold
+                       case 'Q': mapQ = atoi(optarg); break;    // mapping quality threshold
+               }
+       }
+       if (optind == argc) {
+               fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] <in1.bam> [...]\n");
+               return 1;
+       }
+
+       // initialize the auxiliary data structures
+       n = argc - optind; // the number of BAMs on the command line
+       data = calloc(n, sizeof(void*)); // data[i] for the i-th input
+       beg = 0; end = 1<<30; tid = -1;  // set the default region
+       for (i = 0; i < n; ++i) {
+               bam_header_t *htmp;
+               data[i] = calloc(1, sizeof(aux_t));
+               data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM
+               data[i]->min_mapQ = mapQ;                    // set the mapQ filter
+               htmp = bam_header_read(data[i]->fp);         // read the BAM header
+               if (i == 0) {
+                       h = htmp; // keep the header of the 1st BAM
+                       if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region
+               } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header
+               if (tid >= 0) { // if a region is specified and parsed successfully
+                       bam_index_t *idx = bam_index_load(argv[optind+i]);  // load the index
+                       data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator
+                       bam_index_destroy(idx); // the index is not needed any more; phase out of the memory
+               }
+       }
+
+       // the core multi-pileup loop
+       mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
+       n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
+       plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp)
+       while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position
+               if (pos < beg || pos >= end) continue; // out of range; skip
+               if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
+               fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster
+               for (i = 0; i < n; ++i) { // base level filters have to go here
+                       int j, m = 0;
+                       for (j = 0; j < n_plp[i]; ++j) {
+                               const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know
+                               if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
+                               else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
+                       }
+                       printf("\t%d", n_plp[i] - m); // this the depth to output
+               }
+               putchar('\n');
+       }
+       free(n_plp); free(plp);
+       bam_mplp_destroy(mplp);
+
+       bam_header_destroy(h);
+       for (i = 0; i < n; ++i) {
+               bam_close(data[i]->fp);
+               if (data[i]->iter) bam_iter_destroy(data[i]->iter);
+               free(data[i]);
+       }
+       free(data); free(reg);
+       if (bed) bed_destroy(bed);
+       return 0;
+}
diff --git a/sam/bam_cat.c b/sam/bam_cat.c

new file mode 100644 (file)

index 0000000..0fde045
--- /dev/null
+++ b/sam/bam_cat.c
@@ -0,0 +1,184 @@
+/*\r
+\r
+bam_cat -- efficiently concatenates bam files\r
+\r
+bam_cat can be used to concatenate BAM files. Under special\r
+circumstances, it can be used as an alternative to 'samtools merge' to\r
+concatenate multiple sorted files into a single sorted file. For this\r
+to work each file must be sorted, and the sorted files must be given\r
+as command line arguments in order such that the final read in file i\r
+is less than or equal to the first read in file i+1.\r
+\r
+This code is derived from the bam_reheader function in samtools 0.1.8\r
+and modified to perform concatenation by Chris Saunders on behalf of\r
+Illumina.\r
+\r
+\r
+########## License:\r
+\r
+The MIT License\r
+\r
+Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd.\r
+Modified SAMtools work copyright (c) 2010 Illumina, Inc.\r
+\r
+Permission is hereby granted, free of charge, to any person obtaining a copy\r
+of this software and associated documentation files (the "Software"), to deal\r
+in the Software without restriction, including without limitation the rights\r
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\r
+copies of the Software, and to permit persons to whom the Software is\r
+furnished to do so, subject to the following conditions:\r
+\r
+The above copyright notice and this permission notice shall be included in\r
+all copies or substantial portions of the Software.\r
+\r
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\r
+THE SOFTWARE.\r
+\r
+*/\r
+\r
+\r
+/*\r
+makefile:\r
+"""\r
+CC=gcc\r
+CFLAGS+=-g -Wall -O2 -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -I$(SAMTOOLS_DIR)\r
+LDFLAGS+=-L$(SAMTOOLS_DIR)\r
+LDLIBS+=-lbam -lz\r
+\r
+all:bam_cat\r
+"""\r
+*/\r
+\r
+\r
+#include <stdio.h>\r
+#include <stdlib.h>\r
+#include <unistd.h>\r
+\r
+#include "bgzf.h"\r
+#include "bam.h"\r
+\r
+#define BUF_SIZE 0x10000\r
+\r
+#define GZIPID1 31\r
+#define GZIPID2 139\r
+\r
+#define BGZF_EMPTY_BLOCK_SIZE 28\r
+\r
+\r
+int bam_cat(int nfn, char * const *fn, const bam_header_t *h, const char* outbam)\r
+{\r
+    BGZF *fp;\r
+    FILE* fp_file;\r
+    uint8_t *buf;\r
+    uint8_t ebuf[BGZF_EMPTY_BLOCK_SIZE];\r
+    const int es=BGZF_EMPTY_BLOCK_SIZE;\r
+    int i;\r
+    \r
+    fp = strcmp(outbam, "-")? bgzf_open(outbam, "w") : bgzf_fdopen(fileno(stdout), "w");\r
+    if (fp == 0) {\r
+        fprintf(stderr, "[%s] ERROR: fail to open output file '%s'.\n", __func__, outbam);\r
+        return 1;\r
+    }\r
+    if (h) bam_header_write(fp, h);\r
+    \r
+    buf = (uint8_t*) malloc(BUF_SIZE);\r
+    for(i = 0; i < nfn; ++i){\r
+        BGZF *in;\r
+        bam_header_t *old;\r
+        int len,j;\r
+        \r
+        in = strcmp(fn[i], "-")? bam_open(fn[i], "r") : bam_dopen(fileno(stdin), "r");\r
+        if (in == 0) {\r
+            fprintf(stderr, "[%s] ERROR: fail to open file '%s'.\n", __func__, fn[i]);\r
+            return -1;\r
+        }\r
+        if (in->open_mode != 'r') return -1;\r
+        \r
+        old = bam_header_read(in);\r
+               if (h == 0 && i == 0) bam_header_write(fp, old);\r
+        \r
+        if (in->block_offset < in->block_length) {\r
+            bgzf_write(fp, in->uncompressed_block + in->block_offset, in->block_length - in->block_offset);\r
+            bgzf_flush(fp);\r
+        }\r
+        \r
+        j=0;\r
+#ifdef _USE_KNETFILE\r
+        fp_file=fp->x.fpw;\r
+        while ((len = knet_read(in->x.fpr, buf, BUF_SIZE)) > 0) {\r
+#else  \r
+        fp_file=fp->file;\r
+        while (!feof(in->file) && (len = fread(buf, 1, BUF_SIZE, in->file)) > 0) {\r
+#endif\r
+            if(len<es){\r
+                int diff=es-len;\r
+                if(j==0) {\r
+                    fprintf(stderr, "[%s] ERROR: truncated file?: '%s'.\n", __func__, fn[i]);\r
+                    return -1;\r
+                }\r
+                fwrite(ebuf, 1, len, fp_file);\r
+                memcpy(ebuf,ebuf+len,diff);\r
+                memcpy(ebuf+diff,buf,len);\r
+            } else {\r
+                if(j!=0) fwrite(ebuf, 1, es, fp_file);\r
+                len-= es;\r
+                memcpy(ebuf,buf+len,es);\r
+                fwrite(buf, 1, len, fp_file);\r
+            }\r
+            j=1;\r
+        }\r
+\r
+        /* check final gzip block */\r
+        {\r
+            const uint8_t gzip1=ebuf[0];\r
+            const uint8_t gzip2=ebuf[1];\r
+            const uint32_t isize=*((uint32_t*)(ebuf+es-4));\r
+            if(((gzip1!=GZIPID1) || (gzip2!=GZIPID2)) || (isize!=0)) {\r
+                fprintf(stderr, "[%s] WARNING: Unexpected block structure in file '%s'.", __func__, fn[i]);\r
+                fprintf(stderr, " Possible output corruption.\n");\r
+                fwrite(ebuf, 1, es, fp_file);\r
+            }\r
+        }\r
+        bam_header_destroy(old);\r
+        bgzf_close(in);\r
+    }\r
+    free(buf);\r
+    bgzf_close(fp);\r
+    return 0;\r
+}\r
+\r
+\r
+\r
+int main_cat(int argc, char *argv[])\r
+{\r
+    bam_header_t *h = 0;\r
+       char *outfn = 0;\r
+       int c, ret;\r
+       while ((c = getopt(argc, argv, "h:o:")) >= 0) {\r
+               switch (c) {\r
+                       case 'h': {\r
+                       tamFile fph = sam_open(optarg);\r
+                       if (fph == 0) {\r
+                       fprintf(stderr, "[%s] ERROR: fail to read the header from '%s'.\n", __func__, argv[1]);\r
+                           return 1;\r
+                       }\r
+                   h = sam_header_read(fph);\r
+               sam_close(fph);\r
+                               break;\r
+                       }\r
+                       case 'o': outfn = strdup(optarg); break;\r
+               }\r
+       }\r
+       if (argc - optind < 2) {\r
+        fprintf(stderr, "Usage: samtools cat [-h header.sam] [-o out.bam] <in1.bam> <in2.bam> [...]\n");\r
+        return 1;\r
+    }\r
+    ret = bam_cat(argc - optind, argv + optind, h, outfn? outfn : "-");\r
+       free(outfn);\r
+       return ret;\r
+}\r
diff --git a/sam/bcftools/em.c b/sam/bcftools/em.c

new file mode 100644 (file)

index 0000000..b7dfe1a
--- /dev/null
+++ b/sam/bcftools/em.c
@@ -0,0 +1,310 @@
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "bcf.h"
+#include "kmin.h"
+
+static double g_q2p[256];
+
+#define ITER_MAX 50
+#define ITER_TRY 10
+#define EPS 1e-5
+
+extern double kf_gammaq(double, double);
+
+/*
+       Generic routines
+ */
+// get the 3 genotype likelihoods
+static double *get_pdg3(const bcf1_t *b)
+{
+       double *pdg;
+       const uint8_t *PL = 0;
+       int i, PL_len = 0;
+       // initialize g_q2p if necessary
+       if (g_q2p[0] == 0.)
+               for (i = 0; i < 256; ++i)
+                       g_q2p[i] = pow(10., -i / 10.);
+       // set PL and PL_len
+       for (i = 0; i < b->n_gi; ++i) {
+               if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
+                       PL = (const uint8_t*)b->gi[i].data;
+                       PL_len = b->gi[i].len;
+                       break;
+               }
+       }
+       if (i == b->n_gi) return 0; // no PL
+       // fill pdg
+       pdg = malloc(3 * b->n_smpl * sizeof(double));
+       for (i = 0; i < b->n_smpl; ++i) {
+               const uint8_t *pi = PL + i * PL_len;
+               double *p = pdg + i * 3;
+               p[0] = g_q2p[pi[2]]; p[1] = g_q2p[pi[1]]; p[2] = g_q2p[pi[0]];
+       }
+       return pdg;
+}
+
+// estimate site allele frequency in a very naive and inaccurate way
+static double est_freq(int n, const double *pdg)
+{
+       int i, gcnt[3], tmp1;
+       // get a rough estimate of the genotype frequency
+       gcnt[0] = gcnt[1] = gcnt[2] = 0;
+       for (i = 0; i < n; ++i) {
+               const double *p = pdg + i * 3;
+               if (p[0] != 1. || p[1] != 1. || p[2] != 1.) {
+                       int which = p[0] > p[1]? 0 : 1;
+                       which = p[which] > p[2]? which : 2;
+                       ++gcnt[which];
+               }
+       }
+       tmp1 = gcnt[0] + gcnt[1] + gcnt[2];
+       return (tmp1 == 0)? -1.0 : (.5 * gcnt[1] + gcnt[2]) / tmp1;
+}
+
+/*
+       Single-locus EM
+ */
+
+typedef struct {
+       int beg, end;
+       const double *pdg;
+} minaux1_t;
+
+static double prob1(double f, void *data)
+{
+       minaux1_t *a = (minaux1_t*)data;
+       double p = 1., l = 0., f3[3];
+       int i;
+//     printf("brent %lg\n", f);
+       if (f < 0 || f > 1) return 1e300;
+       f3[0] = (1.-f)*(1.-f); f3[1] = 2.*f*(1.-f); f3[2] = f*f;
+       for (i = a->beg; i < a->end; ++i) {
+               const double *pdg = a->pdg + i * 3;
+               p *= pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2];
+               if (p < 1e-200) l -= log(p), p = 1.;
+       }
+       return l - log(p);
+}
+
+// one EM iteration for allele frequency estimate
+static double freq_iter(double *f, const double *_pdg, int beg, int end)
+{
+       double f0 = *f, f3[3], err;
+       int i;
+//     printf("em %lg\n", *f);
+       f3[0] = (1.-f0)*(1.-f0); f3[1] = 2.*f0*(1.-f0); f3[2] = f0*f0;
+       for (i = beg, f0 = 0.; i < end; ++i) {
+               const double *pdg = _pdg + i * 3;
+               f0 += (pdg[1] * f3[1] + 2. * pdg[2] * f3[2])
+                       / (pdg[0] * f3[0] + pdg[1] * f3[1] + pdg[2] * f3[2]);
+       }
+       f0 /= (end - beg) * 2;
+       err = fabs(f0 - *f);
+       *f = f0;
+       return err;
+}
+
+/* The following function combines EM and Brent's method. When the signal from
+ * the data is strong, EM is faster but sometimes, EM may converge very slowly.
+ * When this happens, we switch to Brent's method. The idea is learned from
+ * Rasmus Nielsen.
+ */
+static double freqml(double f0, int beg, int end, const double *pdg)
+{
+       int i;
+       double f;
+       for (i = 0, f = f0; i < ITER_TRY; ++i)
+               if (freq_iter(&f, pdg, beg, end) < EPS) break;
+       if (i == ITER_TRY) { // haven't converged yet; try Brent's method
+               minaux1_t a;
+               a.beg = beg; a.end = end; a.pdg = pdg;
+               kmin_brent(prob1, f0 == f? .5*f0 : f0, f, (void*)&a, EPS, &f);
+       }
+       return f;
+}
+
+// one EM iteration for genotype frequency estimate
+static double g3_iter(double g[3], const double *_pdg, int beg, int end)
+{
+       double err, gg[3];
+       int i;
+       gg[0] = gg[1] = gg[2] = 0.;
+//     printf("%lg,%lg,%lg\n", g[0], g[1], g[2]);
+       for (i = beg; i < end; ++i) {
+               double sum, tmp[3];
+               const double *pdg = _pdg + i * 3;
+               tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2];
+               sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg);
+               gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum;
+       }
+       err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]);
+       err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]);
+       g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2];
+       return err;
+}
+
+// perform likelihood ratio test
+static double lk_ratio_test(int n, int n1, const double *pdg, double f3[3][3])
+{
+       double r;
+       int i;
+       for (i = 0, r = 1.; i < n1; ++i) {
+               const double *p = pdg + i * 3;
+               r *= (p[0] * f3[1][0] + p[1] * f3[1][1] + p[2] * f3[1][2])
+                       / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]);
+       }
+       for (; i < n; ++i) {
+               const double *p = pdg + i * 3;
+               r *= (p[0] * f3[2][0] + p[1] * f3[2][1] + p[2] * f3[2][2])
+                       / (p[0] * f3[0][0] + p[1] * f3[0][1] + p[2] * f3[0][2]);
+       }
+       return r;
+}
+
+// x[0]: ref frequency
+// x[1..3]: alt-alt, alt-ref, ref-ref frequenc
+// x[4]: HWE P-value
+// x[5..6]: group1 freq, group2 freq
+// x[7]: 1-degree P-value
+// x[8]: 2-degree P-value
+int bcf_em1(const bcf1_t *b, int n1, int flag, double x[10])
+{
+       double *pdg;
+       int i, n, n2;
+       if (b->n_alleles < 2) return -1; // one allele only
+       // initialization
+       if (n1 < 0 || n1 > b->n_smpl) n1 = 0;
+       if (flag & 1<<7) flag |= 7<<5; // compute group freq if LRT is required
+       if (flag & 0xf<<1) flag |= 0xf<<1;
+       n = b->n_smpl; n2 = n - n1;
+       pdg = get_pdg3(b);
+       if (pdg == 0) return -1;
+       for (i = 0; i < 10; ++i) x[i] = -1.; // set to negative
+       {
+               if ((x[0] = est_freq(n, pdg)) < 0.) {
+                       free(pdg);
+                       return -1; // no data
+               }
+               x[0] = freqml(x[0], 0, n, pdg);
+       }
+       if (flag & (0xf<<1|3<<8)) { // estimate the genotype frequency and test HWE
+               double *g = x + 1, f3[3], r;
+               f3[0] = g[0] = (1 - x[0]) * (1 - x[0]);
+               f3[1] = g[1] = 2 * x[0] * (1 - x[0]);
+               f3[2] = g[2] = x[0] * x[0];
+               for (i = 0; i < ITER_MAX; ++i)
+                       if (g3_iter(g, pdg, 0, n) < EPS) break;
+               // Hardy-Weinberg equilibrium (HWE)
+               for (i = 0, r = 1.; i < n; ++i) {
+                       double *p = pdg + i * 3;
+                       r *= (p[0] * g[0] + p[1] * g[1] + p[2] * g[2]) / (p[0] * f3[0] + p[1] * f3[1] + p[2] * f3[2]);
+               }
+               x[4] = kf_gammaq(.5, log(r));
+       }
+       if ((flag & 7<<5) && n1 > 0 && n1 < n) { // group frequency
+               x[5] = freqml(x[0], 0, n1, pdg);
+               x[6] = freqml(x[0], n1, n, pdg);
+       }
+       if ((flag & 1<<7) && n1 > 0 && n1 < n) { // 1-degree P-value
+               double f[3], f3[3][3], tmp;
+               f[0] = x[0]; f[1] = x[5]; f[2] = x[6];
+               for (i = 0; i < 3; ++i)
+                       f3[i][0] = (1-f[i])*(1-f[i]), f3[i][1] = 2*f[i]*(1-f[i]), f3[i][2] = f[i]*f[i];
+               tmp = log(lk_ratio_test(n, n1, pdg, f3));
+               if (tmp < 0) tmp = 0;
+               x[7] = kf_gammaq(.5, tmp);
+       }
+       if ((flag & 3<<8) && n1 > 0 && n1 < n) { // 2-degree P-value
+               double g[3][3], tmp;
+               for (i = 0; i < 3; ++i) memcpy(g[i], x + 1, 3 * sizeof(double));
+               for (i = 0; i < ITER_MAX; ++i)
+                       if (g3_iter(g[1], pdg, 0, n1) < EPS) break;
+               for (i = 0; i < ITER_MAX; ++i)
+                       if (g3_iter(g[2], pdg, n1, n) < EPS) break;
+               tmp = log(lk_ratio_test(n, n1, pdg, g));
+               if (tmp < 0) tmp = 0;
+               x[8] = kf_gammaq(1., tmp);
+       }
+       // free
+       free(pdg);
+       return 0;
+}
+
+/*
+       Two-locus EM (LD)
+ */
+
+#define _G1(h, k) ((h>>1&1) + (k>>1&1))
+#define _G2(h, k) ((h&1) + (k&1))
+
+// 0: the previous site; 1: the current site
+static int pair_freq_iter(int n, double *pdg[2], double f[4])
+{
+       double ff[4];
+       int i, k, h;
+//     printf("%lf,%lf,%lf,%lf\n", f[0], f[1], f[2], f[3]);
+       memset(ff, 0, 4 * sizeof(double));
+       for (i = 0; i < n; ++i) {
+               double *p[2], sum, tmp;
+               p[0] = pdg[0] + i * 3; p[1] = pdg[1] + i * 3;
+               for (k = 0, sum = 0.; k < 4; ++k)
+                       for (h = 0; h < 4; ++h)
+                               sum += f[k] * f[h] * p[0][_G1(k,h)] * p[1][_G2(k,h)];
+               for (k = 0; k < 4; ++k) {
+                       tmp = f[0] * (p[0][_G1(0,k)] * p[1][_G2(0,k)] + p[0][_G1(k,0)] * p[1][_G2(k,0)])
+                               + f[1] * (p[0][_G1(1,k)] * p[1][_G2(1,k)] + p[0][_G1(k,1)] * p[1][_G2(k,1)])
+                               + f[2] * (p[0][_G1(2,k)] * p[1][_G2(2,k)] + p[0][_G1(k,2)] * p[1][_G2(k,2)])
+                               + f[3] * (p[0][_G1(3,k)] * p[1][_G2(3,k)] + p[0][_G1(k,3)] * p[1][_G2(k,3)]);
+                       ff[k] += f[k] * tmp / sum;
+               }
+       }
+       for (k = 0; k < 4; ++k) f[k] = ff[k] / (2 * n);
+       return 0;
+}
+
+double bcf_pair_freq(const bcf1_t *b0, const bcf1_t *b1, double f[4])
+{
+       const bcf1_t *b[2];
+       int i, j, n_smpl;
+       double *pdg[2], flast[4], r, f0[2];
+       // initialize others
+       if (b0->n_smpl != b1->n_smpl) return -1; // different number of samples
+       n_smpl = b0->n_smpl;
+       b[0] = b0; b[1] = b1;
+       f[0] = f[1] = f[2] = f[3] = -1.;
+       if (b[0]->n_alleles < 2 || b[1]->n_alleles < 2) return -1; // one allele only
+       pdg[0] = get_pdg3(b0); pdg[1] = get_pdg3(b1);
+       if (pdg[0] == 0 || pdg[1] == 0) {
+               free(pdg[0]); free(pdg[1]);
+               return -1;
+       }
+       // set the initial value
+       f0[0] = est_freq(n_smpl, pdg[0]);
+       f0[1] = est_freq(n_smpl, pdg[1]);
+       f[0] = (1 - f0[0]) * (1 - f0[1]); f[3] = f0[0] * f0[1];
+       f[1] = (1 - f0[0]) * f0[1]; f[2] = f0[0] * (1 - f0[1]);
+       // iteration
+       for (j = 0; j < ITER_MAX; ++j) {
+               double eps = 0;
+               memcpy(flast, f, 4 * sizeof(double));
+               pair_freq_iter(n_smpl, pdg, f);
+               for (i = 0; i < 4; ++i) {
+                       double x = fabs(f[i] - flast[i]);
+                       if (x > eps) eps = x;
+               }
+               if (eps < EPS) break;
+       }
+       // free
+       free(pdg[0]); free(pdg[1]);
+       { // calculate r^2
+               double p[2], q[2], D;
+               p[0] = f[0] + f[1]; q[0] = 1 - p[0];
+               p[1] = f[0] + f[2]; q[1] = 1 - p[1];
+               D = f[0] * f[3] - f[1] * f[2];
+               r = sqrt(D * D / (p[0] * p[1] * q[0] * q[1]));
+//             printf("R(%lf,%lf,%lf,%lf)=%lf\n", f[0], f[1], f[2], f[3], r);
+               if (isnan(r)) r = -1.;
+       }
+       return r;
+}
diff --git a/sam/bcftools/kmin.c b/sam/bcftools/kmin.c

new file mode 100644 (file)

index 0000000..5b8193b
--- /dev/null
+++ b/sam/bcftools/kmin.c
@@ -0,0 +1,209 @@
+/* The MIT License
+
+   Copyright (c) 2008, 2010 by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+/* Hooke-Jeeves algorithm for nonlinear minimization
+ 
+   Based on the pseudocodes by Bell and Pike (CACM 9(9):684-685), and
+   the revision by Tomlin and Smith (CACM 12(11):637-638). Both of the
+   papers are comments on Kaupe's Algorithm 178 "Direct Search" (ACM
+   6(6):313-314). The original algorithm was designed by Hooke and
+   Jeeves (ACM 8:212-229). This program is further revised according to
+   Johnson's implementation at Netlib (opt/hooke.c).
+ 
+   Hooke-Jeeves algorithm is very simple and it works quite well on a
+   few examples. However, it might fail to converge due to its heuristic
+   nature. A possible improvement, as is suggested by Johnson, may be to
+   choose a small r at the beginning to quickly approach to the minimum
+   and a large r at later step to hit the minimum.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "kmin.h"
+
+static double __kmin_hj_aux(kmin_f func, int n, double *x1, void *data, double fx1, double *dx, int *n_calls)
+{
+       int k, j = *n_calls;
+       double ftmp;
+       for (k = 0; k != n; ++k) {
+               x1[k] += dx[k];
+               ftmp = func(n, x1, data); ++j;
+               if (ftmp < fx1) fx1 = ftmp;
+               else { /* search the opposite direction */
+                       dx[k] = 0.0 - dx[k];
+                       x1[k] += dx[k] + dx[k];
+                       ftmp = func(n, x1, data); ++j;
+                       if (ftmp < fx1) fx1 = ftmp;
+                       else x1[k] -= dx[k]; /* back to the original x[k] */
+               }
+       }
+       *n_calls = j;
+       return fx1; /* here: fx1=f(n,x1) */
+}
+
+double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls)
+{
+       double fx, fx1, *x1, *dx, radius;
+       int k, n_calls = 0;
+       x1 = (double*)calloc(n, sizeof(double));
+       dx = (double*)calloc(n, sizeof(double));
+       for (k = 0; k != n; ++k) { /* initial directions, based on MGJ */
+               dx[k] = fabs(x[k]) * r;
+               if (dx[k] == 0) dx[k] = r;
+       }
+       radius = r;
+       fx1 = fx = func(n, x, data); ++n_calls;
+       for (;;) {
+               memcpy(x1, x, n * sizeof(double)); /* x1 = x */
+               fx1 = __kmin_hj_aux(func, n, x1, data, fx, dx, &n_calls);
+               while (fx1 < fx) {
+                       for (k = 0; k != n; ++k) {
+                               double t = x[k];
+                               dx[k] = x1[k] > x[k]? fabs(dx[k]) : 0.0 - fabs(dx[k]);
+                               x[k] = x1[k];
+                               x1[k] = x1[k] + x1[k] - t;
+                       }
+                       fx = fx1;
+                       if (n_calls >= max_calls) break;
+                       fx1 = func(n, x1, data); ++n_calls;
+                       fx1 = __kmin_hj_aux(func, n, x1, data, fx1, dx, &n_calls);
+                       if (fx1 >= fx) break;
+                       for (k = 0; k != n; ++k)
+                               if (fabs(x1[k] - x[k]) > .5 * fabs(dx[k])) break;
+                       if (k == n) break;
+               }
+               if (radius >= eps) {
+                       if (n_calls >= max_calls) break;
+                       radius *= r;
+                       for (k = 0; k != n; ++k) dx[k] *= r;
+               } else break; /* converge */
+       }
+       free(x1); free(dx);
+       return fx1;
+}
+
+// I copied this function somewhere several years ago with some of my modifications, but I forgot the source.
+double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin)
+{
+       double bound, u, r, q, fu, tmp, fa, fb, fc, c;
+       const double gold1 = 1.6180339887;
+       const double gold2 = 0.3819660113;
+       const double tiny = 1e-20;
+       const int max_iter = 100;
+
+       double e, d, w, v, mid, tol1, tol2, p, eold, fv, fw;
+       int iter;
+
+       fa = func(a, data); fb = func(b, data);
+       if (fb > fa) { // swap, such that f(a) > f(b)
+               tmp = a; a = b; b = tmp;
+               tmp = fa; fa = fb; fb = tmp;
+       }
+       c = b + gold1 * (b - a), fc = func(c, data); // golden section extrapolation
+       while (fb > fc) {
+               bound = b + 100.0 * (c - b); // the farthest point where we want to go
+               r = (b - a) * (fb - fc);
+               q = (b - c) * (fb - fa);
+               if (fabs(q - r) < tiny) { // avoid 0 denominator
+                       tmp = q > r? tiny : 0.0 - tiny;
+               } else tmp = q - r;
+               u = b - ((b - c) * q - (b - a) * r) / (2.0 * tmp); // u is the parabolic extrapolation point
+               if ((b > u && u > c) || (b < u && u < c)) { // u lies between b and c
+                       fu = func(u, data);
+                       if (fu < fc) { // (b,u,c) bracket the minimum
+                               a = b; b = u; fa = fb; fb = fu;
+                               break;
+                       } else if (fu > fb) { // (a,b,u) bracket the minimum
+                               c = u; fc = fu;
+                               break;
+                       }
+                       u = c + gold1 * (c - b); fu = func(u, data); // golden section extrapolation
+               } else if ((c > u && u > bound) || (c < u && u < bound)) { // u lies between c and bound
+                       fu = func(u, data);
+                       if (fu < fc) { // fb > fc > fu
+                               b = c; c = u; u = c + gold1 * (c - b);
+                               fb = fc; fc = fu; fu = func(u, data);
+                       } else { // (b,c,u) bracket the minimum
+                               a = b; b = c; c = u;
+                               fa = fb; fb = fc; fc = fu;
+                               break;
+                       }
+               } else if ((u > bound && bound > c) || (u < bound && bound < c)) { // u goes beyond the bound
+                       u = bound; fu = func(u, data);
+               } else { // u goes the other way around, use golden section extrapolation
+                       u = c + gold1 * (c - b); fu = func(u, data);
+               }
+               a = b; b = c; c = u;
+               fa = fb; fb = fc; fc = fu;
+       }
+       if (a > c) u = a, a = c, c = u; // swap
+
+       // now, a<b<c, fa>fb and fb<fc, move on to Brent's algorithm
+       e = d = 0.0;
+       w = v = b; fv = fw = fb;
+       for (iter = 0; iter != max_iter; ++iter) {
+               mid = 0.5 * (a + c);
+               tol2 = 2.0 * (tol1 = tol * fabs(b) + tiny);
+               if (fabs(b - mid) <= (tol2 - 0.5 * (c - a))) {
+                       *xmin = b; return fb; // found
+               }
+               if (fabs(e) > tol1) {
+                       // related to parabolic interpolation
+                       r = (b - w) * (fb - fv);
+                       q = (b - v) * (fb - fw);
+                       p = (b - v) * q - (b - w) * r;
+                       q = 2.0 * (q - r);
+                       if (q > 0.0) p = 0.0 - p;
+                       else q = 0.0 - q;
+                       eold = e; e = d;
+                       if (fabs(p) >= fabs(0.5 * q * eold) || p <= q * (a - b) || p >= q * (c - b)) {
+                               d = gold2 * (e = (b >= mid ? a - b : c - b));
+                       } else {
+                               d = p / q; u = b + d; // actual parabolic interpolation happens here
+                               if (u - a < tol2 || c - u < tol2)
+                                       d = (mid > b)? tol1 : 0.0 - tol1;
+                       }
+               } else d = gold2 * (e = (b >= mid ? a - b : c - b)); // golden section interpolation
+               u = fabs(d) >= tol1 ? b + d : b + (d > 0.0? tol1 : -tol1);
+               fu = func(u, data);
+               if (fu <= fb) { // u is the minimum point so far
+                       if (u >= b) a = b;
+                       else c = b;
+                       v = w; w = b; b = u; fv = fw; fw = fb; fb = fu;
+               } else { // adjust (a,c) and (u,v,w)
+                       if (u < b) a = u;
+                       else c = u;
+                       if (fu <= fw || w == b) {
+                               v = w; w = u;
+                               fv = fw; fw = fu;
+                       } else if (fu <= fv || v == b || v == w) {
+                               v = u; fv = fu;
+                       }
+               }
+       }
+       *xmin = b;
+       return fb;
+}
diff --git a/sam/bcftools/kmin.h b/sam/bcftools/kmin.h

new file mode 100644 (file)

index 0000000..6feba45
--- /dev/null
+++ b/sam/bcftools/kmin.h
@@ -0,0 +1,46 @@
+/*
+   Copyright (c) 2008, 2010 by Attractive Chaos <attractor@live.co.uk>
+
+   Permission is hereby granted, free of charge, to any person obtaining
+   a copy of this software and associated documentation files (the
+   "Software"), to deal in the Software without restriction, including
+   without limitation the rights to use, copy, modify, merge, publish,
+   distribute, sublicense, and/or sell copies of the Software, and to
+   permit persons to whom the Software is furnished to do so, subject to
+   the following conditions:
+
+   The above copyright notice and this permission notice shall be
+   included in all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+   SOFTWARE.
+*/
+
+#ifndef KMIN_H
+#define KMIN_H
+
+#define KMIN_RADIUS  0.5
+#define KMIN_EPS     1e-7
+#define KMIN_MAXCALL 50000
+
+typedef double (*kmin_f)(int, double*, void*);
+typedef double (*kmin1_f)(double, void*);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+       double kmin_hj(kmin_f func, int n, double *x, void *data, double r, double eps, int max_calls);
+       double kmin_brent(kmin1_f func, double a, double b, void *data, double tol, double *xmin);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/sam/bcftools/mut.c b/sam/bcftools/mut.c

new file mode 100644 (file)

index 0000000..15ef265
--- /dev/null
+++ b/sam/bcftools/mut.c
@@ -0,0 +1,127 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include "bcf.h"
+
+#define MAX_GENO 359
+
+int8_t seq_bitcnt[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
+char *seq_nt16rev = "XACMGRSVTWYHKDBN";
+
+uint32_t *bcf_trio_prep(int is_x, int is_son)
+{
+       int i, j, k, n, map[10];
+       uint32_t *ret;
+       ret = calloc(MAX_GENO, 4);
+       for (i = 0, k = 0; i < 4; ++i)
+               for (j = i; j < 4; ++j)
+                       map[k++] = 1<<i|1<<j;
+       for (i = 0, n = 1; i < 10; ++i) { // father
+               if (is_x && seq_bitcnt[map[i]] != 1) continue;
+               if (is_x && is_son) {
+                       for (j = 0; j < 10; ++j) // mother
+                               for (k = 0; k < 10; ++k) // child
+                                       if (seq_bitcnt[map[k]] == 1 && (map[j]&map[k]))
+                                               ret[n++] = j<<16 | i<<8 | k;
+               } else {
+                       for (j = 0; j < 10; ++j) // mother
+                               for (k = 0; k < 10; ++k) // child
+                                       if ((map[i]&map[k]) && (map[j]&map[k]) && ((map[i]|map[j])&map[k]) == map[k])
+                                               ret[n++] = j<<16 | i<<8 | k;
+               }
+       }
+       ret[0] = n - 1;
+       return ret;
+}
+
+
+int bcf_trio_call(const uint32_t *prep, const bcf1_t *b, int *llr, int64_t *gt)
+{
+       int i, j, k;
+       const bcf_ginfo_t *PL;
+       uint8_t *gl10;
+       int map[10];
+       if (b->n_smpl != 3) return -1; // not a trio
+       for (i = 0; i < b->n_gi; ++i)
+               if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
+       if (i == b->n_gi) return -1; // no PL
+       gl10 = alloca(10 * b->n_smpl);
+       if (bcf_gl10(b, gl10) < 0) {
+               if (bcf_gl10_indel(b, gl10) < 0) return -1;
+       }
+       PL = b->gi + i;
+       for (i = 0, k = 0; i < 4; ++i)
+               for (j = i; j < 4; ++j)
+                       map[k++] = seq_nt16rev[1<<i|1<<j];
+       for (j = 0; j < 3; ++j) // check if ref hom is the most probable in all members
+               if (((uint8_t*)PL->data)[j * PL->len] != 0) break;
+       if (j < 3) { // we need to go through the complex procedure
+               uint8_t *g[3];
+               int minc = 1<<30, minc_j = -1, minf = 0, gtf = 0, gtc = 0;
+               g[0] = gl10;
+               g[1] = gl10 + 10;
+               g[2] = gl10 + 20;
+               for (j = 1; j <= (int)prep[0]; ++j) { // compute LK with constraint
+                       int sum = g[0][prep[j]&0xff] + g[1][prep[j]>>8&0xff] + g[2][prep[j]>>16&0xff];
+                       if (sum < minc) minc = sum, minc_j = j;
+               }
+               gtc |= map[prep[minc_j]&0xff]; gtc |= map[prep[minc_j]>>8&0xff]<<8; gtc |= map[prep[minc_j]>>16]<<16;
+               for (j = 0; j < 3; ++j) { // compute LK without constraint
+                       int min = 1<<30, min_k = -1;
+                       for (k = 0; k < 10; ++k)
+                               if (g[j][k] < min) min = g[j][k], min_k = k;
+                       gtf |= map[min_k]<<(j*8);
+                       minf += min;
+               }
+               *llr = minc - minf; *gt = (int64_t)gtc<<32 | gtf;
+       } else *llr = 0, *gt = -1;
+       return 0;
+}
+
+int bcf_pair_call(const bcf1_t *b)
+{
+       int i, j, k;
+       const bcf_ginfo_t *PL;
+       if (b->n_smpl != 2) return -1; // not a pair
+       for (i = 0; i < b->n_gi; ++i)
+               if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
+       if (i == b->n_gi) return -1; // no PL
+       PL = b->gi + i;
+       for (j = 0; j < 2; ++j) // check if ref hom is the most probable in all members
+               if (((uint8_t*)PL->data)[j * PL->len] != 0) break;
+       if (j < 2) { // we need to go through the complex procedure
+               uint8_t *g[2];
+               int minc = 1<<30, minf = 0;
+               g[0] = PL->data;
+               g[1] = (uint8_t*)PL->data + PL->len;
+               for (j = 0; j < PL->len; ++j) // compute LK with constraint
+                       minc = minc < g[0][j] + g[1][j]? minc : g[0][j] + g[1][j];
+               for (j = 0; j < 2; ++j) { // compute LK without constraint
+                       int min = 1<<30;
+                       for (k = 0; k < PL->len; ++k)
+                               min = min < g[j][k]? min : g[j][k];
+                       minf += min;
+               }
+               return minc - minf;
+       } else return 0;
+}
+
+int bcf_min_diff(const bcf1_t *b)
+{
+       int i, min = 1<<30;
+       const bcf_ginfo_t *PL;
+       for (i = 0; i < b->n_gi; ++i)
+               if (b->gi[i].fmt == bcf_str2int("PL", 2)) break;
+       if (i == b->n_gi) return -1; // no PL
+       PL = b->gi + i;
+       for (i = 0; i < b->n_smpl; ++i) {
+               int m1, m2, j;
+               const uint8_t *p = (uint8_t*)PL->data;
+               m1 = m2 = 1<<30;
+               for (j = 0; j < PL->len; ++j) {
+                       if ((int)p[j] < m1) m2 = m1, m1 = p[j];
+                       else if ((int)p[j] < m2) m2 = p[j];
+               }
+               min = min < m2 - m1? min : m2 - m1;
+       }
+       return min;
+}
diff --git a/sam/bedidx.c b/sam/bedidx.c

new file mode 100644 (file)

index 0000000..ec75a10
--- /dev/null
+++ b/sam/bedidx.c
@@ -0,0 +1,162 @@
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+#include <zlib.h>
+
+#ifdef _WIN32
+#define drand48() ((double)rand() / RAND_MAX)
+#endif
+
+#include "ksort.h"
+KSORT_INIT_GENERIC(uint64_t)
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 8192)
+
+typedef struct {
+       int n, m;
+       uint64_t *a;
+       int *idx;
+} bed_reglist_t;
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(reg, bed_reglist_t)
+
+#define LIDX_SHIFT 13
+
+typedef kh_reg_t reghash_t;
+
+int *bed_index_core(int n, uint64_t *a, int *n_idx)
+{
+       int i, j, m, *idx;
+       m = *n_idx = 0; idx = 0;
+       for (i = 0; i < n; ++i) {
+               int beg, end;
+               beg = a[i]>>32 >> LIDX_SHIFT; end = ((uint32_t)a[i]) >> LIDX_SHIFT;
+               if (m < end + 1) {
+                       int oldm = m;
+                       m = end + 1;
+                       kroundup32(m);
+                       idx = realloc(idx, m * sizeof(int));
+                       for (j = oldm; j < m; ++j) idx[j] = -1;
+               }
+               if (beg == end) {
+                       if (idx[beg] < 0) idx[beg] = i;
+               } else {
+                       for (j = beg; j <= end; ++j)
+                               if (idx[j] < 0) idx[j] = i;
+               }
+               *n_idx = end + 1;
+       }
+       return idx;
+}
+
+void bed_index(void *_h)
+{
+       reghash_t *h = (reghash_t*)_h;
+       khint_t k;
+       for (k = 0; k < kh_end(h); ++k) {
+               if (kh_exist(h, k)) {
+                       bed_reglist_t *p = &kh_val(h, k);
+                       if (p->idx) free(p->idx);
+                       ks_introsort(uint64_t, p->n, p->a);
+                       p->idx = bed_index_core(p->n, p->a, &p->m);
+               }
+       }
+}
+
+int bed_overlap_core(const bed_reglist_t *p, int beg, int end)
+{
+       int i, min_off;
+       if (p->n == 0) return 0;
+       min_off = (beg>>LIDX_SHIFT >= p->n)? p->idx[p->n-1] : p->idx[beg>>LIDX_SHIFT];
+       if (min_off < 0) { // TODO: this block can be improved, but speed should not matter too much here
+               int n = beg>>LIDX_SHIFT;
+               if (n > p->n) n = p->n;
+               for (i = n - 1; i >= 0; --i)
+                       if (p->idx[i] >= 0) break;
+               min_off = i >= 0? p->idx[i] : 0;
+       }
+       for (i = min_off; i < p->n; ++i) {
+               if ((int)(p->a[i]>>32) >= end) break; // out of range; no need to proceed
+               if ((int32_t)p->a[i] > beg && (int32_t)(p->a[i]>>32) < end)
+                       return 1; // find the overlap; return
+       }
+       return 0;
+}
+
+int bed_overlap(const void *_h, const char *chr, int beg, int end)
+{
+       const reghash_t *h = (const reghash_t*)_h;
+       khint_t k;
+       if (!h) return 0;
+       k = kh_get(reg, h, chr);
+       if (k == kh_end(h)) return 0;
+       return bed_overlap_core(&kh_val(h, k), beg, end);
+}
+
+void *bed_read(const char *fn)
+{
+       reghash_t *h = kh_init(reg);
+       gzFile fp;
+       kstream_t *ks;
+       int dret;
+       kstring_t *str;
+       // read the list
+       fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+       if (fp == 0) return 0;
+       str = calloc(1, sizeof(kstring_t));
+       ks = ks_init(fp);
+       while (ks_getuntil(ks, 0, str, &dret) >= 0) { // read the chr name
+               int beg = -1, end = -1;
+               bed_reglist_t *p;
+               khint_t k = kh_get(reg, h, str->s);
+               if (k == kh_end(h)) { // absent from the hash table
+                       int ret;
+                       char *s = strdup(str->s);
+                       k = kh_put(reg, h, s, &ret);
+                       memset(&kh_val(h, k), 0, sizeof(bed_reglist_t));
+               }
+               p = &kh_val(h, k);
+               if (dret != '\n') { // if the lines has other characters
+                       if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
+                               beg = atoi(str->s); // begin
+                               if (dret != '\n') {
+                                       if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
+                                               end = atoi(str->s); // end
+                                               if (end < beg) end = -1;
+                                       }
+                               }
+                       }
+               }
+               if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); // skip the rest of the line
+               if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
+               if (beg >= 0 && end > beg) {
+                       if (p->n == p->m) {
+                               p->m = p->m? p->m<<1 : 4;
+                               p->a = realloc(p->a, p->m * 8);
+                       }
+                       p->a[p->n++] = (uint64_t)beg<<32 | end;
+               }
+       }
+       ks_destroy(ks);
+       gzclose(fp);
+       free(str->s); free(str);
+       bed_index(h);
+       return h;
+}
+
+void bed_destroy(void *_h)
+{
+       reghash_t *h = (reghash_t*)_h;
+       khint_t k;
+       for (k = 0; k < kh_end(h); ++k) {
+               if (kh_exist(h, k)) {
+                       free(kh_val(h, k).a);
+                       free(kh_val(h, k).idx);
+                       free((char*)kh_key(h, k));
+               }
+       }
+       kh_destroy(reg, h);
+}
diff --git a/sam/cut_target.c b/sam/cut_target.c

new file mode 100644 (file)

index 0000000..26f434f
--- /dev/null
+++ b/sam/cut_target.c
@@ -0,0 +1,193 @@
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include "bam.h"
+#include "errmod.h"
+#include "faidx.h"
+
+#define ERR_DEP 0.83f
+
+typedef struct {
+       int e[2][3], p[2][2];
+} score_param_t;
+
+/* Note that although the two matrics have 10 parameters in total, only 4
+ * (probably 3) are free.  Changing the scoring matrices in a sort of symmetric
+ * way will not change the result. */
+static score_param_t g_param = { {{0,0,0},{-4,1,6}}, {{0,-14000}, {0,0}} };
+
+typedef struct {
+       int min_baseQ, tid, max_bases;
+       uint16_t *bases;
+       bamFile fp;
+       bam_header_t *h;
+       char *ref;
+       faidx_t *fai;
+       errmod_t *em;
+} ct_t;
+
+static uint16_t gencns(ct_t *g, int n, const bam_pileup1_t *plp)
+{
+       int i, j, ret, tmp, k, sum[4], qual;
+       float q[16];
+       if (n > g->max_bases) { // enlarge g->bases
+               g->max_bases = n;
+               kroundup32(g->max_bases);
+               g->bases = realloc(g->bases, g->max_bases * 2);
+       }
+       for (i = k = 0; i < n; ++i) {
+               const bam_pileup1_t *p = plp + i;
+               uint8_t *seq;
+               int q, baseQ, b;
+               if (p->is_refskip || p->is_del) continue;
+               baseQ = bam1_qual(p->b)[p->qpos];
+               if (baseQ < g->min_baseQ) continue;
+               seq = bam1_seq(p->b);
+               b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)];
+               if (b > 3) continue;
+               q = baseQ < p->b->core.qual? baseQ : p->b->core.qual;
+               if (q < 4) q = 4;
+               if (q > 63) q = 63;
+               g->bases[k++] = q<<5 | bam1_strand(p->b)<<4 | b;
+       }
+       if (k == 0) return 0;
+       errmod_cal(g->em, k, 4, g->bases, q);
+       for (i = 0; i < 4; ++i) sum[i] = (int)(q[i<<2|i] + .499) << 2 | i;
+       for (i = 1; i < 4; ++i) // insertion sort
+               for (j = i; j > 0 && sum[j] < sum[j-1]; --j)
+                       tmp = sum[j], sum[j] = sum[j-1], sum[j-1] = tmp;
+       qual = (sum[1]>>2) - (sum[0]>>2);
+       k = k < 256? k : 255;
+       ret = (qual < 63? qual : 63) << 2 | (sum[0]&3);
+       return ret<<8|k;
+}
+
+static void process_cns(bam_header_t *h, int tid, int l, uint16_t *cns)
+{
+       int i, f[2][2], *prev, *curr, *swap_tmp, s;
+       uint8_t *b; // backtrack array
+       b = calloc(l, 1);
+       f[0][0] = f[0][1] = 0;
+       prev = f[0]; curr = f[1];
+       // fill the backtrack matrix
+       for (i = 0; i < l; ++i) {
+               int c = (cns[i] == 0)? 0 : (cns[i]>>8 == 0)? 1 : 2;
+               int tmp0, tmp1;
+               // compute f[0]
+               tmp0 = prev[0] + g_param.e[0][c] + g_param.p[0][0]; // (s[i+1],s[i])=(0,0)
+               tmp1 = prev[1] + g_param.e[0][c] + g_param.p[1][0]; // (0,1)
+               if (tmp0 > tmp1) curr[0] = tmp0, b[i] = 0;
+               else curr[0] = tmp1, b[i] = 1;
+               // compute f[1]
+               tmp0 = prev[0] + g_param.e[1][c] + g_param.p[0][1]; // (s[i+1],s[i])=(1,0)
+               tmp1 = prev[1] + g_param.e[1][c] + g_param.p[1][1]; // (1,1)
+               if (tmp0 > tmp1) curr[1] = tmp0, b[i] |= 0<<1;
+               else curr[1] = tmp1, b[i] |= 1<<1;
+               // swap
+               swap_tmp = prev; prev = curr; curr = swap_tmp;
+       }
+       // backtrack
+       s = prev[0] > prev[1]? 0 : 1;
+       for (i = l - 1; i > 0; --i) {
+               b[i] |= s<<2;
+               s = b[i]>>s&1;
+       }
+       // print
+       for (i = 0, s = -1; i <= l; ++i) {
+               if (i == l || ((b[i]>>2&3) == 0 && s >= 0)) {
+                       if (s >= 0) {
+                               int j;
+                               printf("%s:%d-%d\t0\t%s\t%d\t60\t%dM\t*\t0\t0\t", h->target_name[tid], s+1, i, h->target_name[tid], s+1, i-s);
+                               for (j = s; j < i; ++j) {
+                                       int c = cns[j]>>8;
+                                       if (c == 0) putchar('N');
+                                       else putchar("ACGT"[c&3]);
+                               }
+                               putchar('\t');
+                               for (j = s; j < i; ++j)
+                                       putchar(33 + (cns[j]>>8>>2));
+                               putchar('\n');
+                       }
+                       //if (s >= 0) printf("%s\t%d\t%d\t%d\n", h->target_name[tid], s, i, i - s);
+                       s = -1;
+               } else if ((b[i]>>2&3) && s < 0) s = i;
+       }
+       free(b);
+}
+
+static int read_aln(void *data, bam1_t *b)
+{
+       extern int bam_prob_realn_core(bam1_t *b, const char *ref, int flag);
+       ct_t *g = (ct_t*)data;
+       int ret, len;
+       ret = bam_read1(g->fp, b);
+       if (ret >= 0 && g->fai && b->core.tid >= 0 && (b->core.flag&4) == 0) {
+               if (b->core.tid != g->tid) { // then load the sequence
+                       free(g->ref);
+                       g->ref = fai_fetch(g->fai, g->h->target_name[b->core.tid], &len);
+                       g->tid = b->core.tid;
+               }
+               bam_prob_realn_core(b, g->ref, 1<<1|1);
+       }
+       return ret;
+}
+
+int main_cut_target(int argc, char *argv[])
+{
+       int c, tid, pos, n, lasttid = -1, lastpos = -1, l, max_l;
+       const bam_pileup1_t *p;
+       bam_plp_t plp;
+       uint16_t *cns;
+       ct_t g;
+
+       memset(&g, 0, sizeof(ct_t));
+       g.min_baseQ = 13; g.tid = -1;
+       while ((c = getopt(argc, argv, "f:Q:i:o:0:1:2:")) >= 0) {
+               switch (c) {
+                       case 'Q': g.min_baseQ = atoi(optarg); break; // quality cutoff
+                       case 'i': g_param.p[0][1] = -atoi(optarg); break; // 0->1 transition (in) PENALTY
+                       case '0': g_param.e[1][0] = atoi(optarg); break; // emission SCORE
+                       case '1': g_param.e[1][1] = atoi(optarg); break;
+                       case '2': g_param.e[1][2] = atoi(optarg); break;
+                       case 'f': g.fai = fai_load(optarg);
+                               if (g.fai == 0) fprintf(stderr, "[%s] fail to load the fasta index.\n", __func__);
+                               break;
+               }
+       }
+       if (argc == optind) {
+               fprintf(stderr, "Usage: samtools targetcut [-Q minQ] [-i inPen] [-0 em0] [-1 em1] [-2 em2] [-f ref] <in.bam>\n");
+               return 1;
+       }
+       l = max_l = 0; cns = 0;
+       g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+       g.h = bam_header_read(g.fp);
+       g.em = errmod_init(1 - ERR_DEP);
+       plp = bam_plp_init(read_aln, &g);
+       while ((p = bam_plp_auto(plp, &tid, &pos, &n)) != 0) {
+               if (tid < 0) break;
+               if (tid != lasttid) { // change of chromosome
+                       if (cns) process_cns(g.h, lasttid, l, cns);
+                       if (max_l < g.h->target_len[tid]) {
+                               max_l = g.h->target_len[tid];
+                               kroundup32(max_l);
+                               cns = realloc(cns, max_l * 2);
+                       }
+                       l = g.h->target_len[tid];
+                       memset(cns, 0, max_l * 2);
+                       lasttid = tid;
+               }
+               cns[pos] = gencns(&g, n, p);
+               lastpos = pos;
+       }
+       process_cns(g.h, lasttid, l, cns);
+       free(cns);
+       bam_header_destroy(g.h);
+       bam_plp_destroy(plp);
+       bam_close(g.fp);
+       if (g.fai) {
+               fai_destroy(g.fai); free(g.ref);
+       }
+       errmod_destroy(g.em);
+       free(g.bases);
+       return 0;
+}
diff --git a/sam/misc/seqtk.c b/sam/misc/seqtk.c

new file mode 100644 (file)

index 0000000..591ddff
--- /dev/null
+++ b/sam/misc/seqtk.c
@@ -0,0 +1,783 @@
+#include <stdio.h>
+#include <ctype.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <zlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <limits.h>
+
+#include "kseq.h"
+KSEQ_INIT(gzFile, gzread)
+
+typedef struct {
+       int n, m;
+       uint64_t *a;
+} reglist_t;
+
+#include "khash.h"
+KHASH_MAP_INIT_STR(reg, reglist_t)
+
+typedef kh_reg_t reghash_t;
+
+reghash_t *stk_reg_read(const char *fn)
+{
+       reghash_t *h = kh_init(reg);
+       gzFile fp;
+       kstream_t *ks;
+       int dret;
+       kstring_t *str;
+       // read the list
+       str = calloc(1, sizeof(kstring_t));
+       fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+       ks = ks_init(fp);
+       while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+               int beg = -1, end = -1;
+               reglist_t *p;
+               khint_t k = kh_get(reg, h, str->s);
+               if (k == kh_end(h)) {
+                       int ret;
+                       char *s = strdup(str->s);
+                       k = kh_put(reg, h, s, &ret);
+                       memset(&kh_val(h, k), 0, sizeof(reglist_t));
+               }
+               p = &kh_val(h, k);
+               if (dret != '\n') {
+                       if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
+                               beg = atoi(str->s);
+                               if (dret != '\n') {
+                                       if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
+                                               end = atoi(str->s);
+                                               if (end < 0) end = -1;
+                                       }
+                               }
+                       }
+               }
+               // skip the rest of the line
+               if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
+               if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
+               if (beg < 0) beg = 0, end = INT_MAX;
+               if (p->n == p->m) {
+                       p->m = p->m? p->m<<1 : 4;
+                       p->a = realloc(p->a, p->m * 8);
+               }
+               p->a[p->n++] = (uint64_t)beg<<32 | end;
+       }
+       ks_destroy(ks);
+       gzclose(fp);
+       free(str->s); free(str);
+       return h;
+}
+
+void stk_reg_destroy(reghash_t *h)
+{
+       khint_t k;
+       for (k = 0; k < kh_end(h); ++k) {
+               if (kh_exist(h, k)) {
+                       free(kh_val(h, k).a);
+                       free((char*)kh_key(h, k));
+               }
+       }
+       kh_destroy(reg, h);
+}
+
+/* constant table */
+
+unsigned char seq_nt16_table[256] = {
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15 /*'-'*/,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+       15,15, 5, 6,  8,15, 7, 9,  0,10,15,15, 15,15,15,15,
+       15, 1,14, 2, 13,15,15, 4, 11,15,15,12, 15, 3,15,15,
+       15,15, 5, 6,  8,15, 7, 9,  0,10,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15,
+       15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15
+};
+
+char *seq_nt16_rev_table = "XACMGRSVTWYHKDBN";
+unsigned char seq_nt16to4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+int bitcnt_table[] = { 4, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4 };
+
+/* composition */
+int stk_comp(int argc, char *argv[])
+{
+       gzFile fp;
+       kseq_t *seq;
+       int l, c, upper_only = 0;
+       reghash_t *h = 0;
+       reglist_t dummy;
+       while ((c = getopt(argc, argv, "ur:")) >= 0) {
+               switch (c) {
+                       case 'u': upper_only = 1; break;
+                       case 'r': h = stk_reg_read(optarg); break;
+               }
+       }
+       if (argc == optind) {
+               fprintf(stderr, "Usage:  seqtk comp [-u] [-r in.bed] <in.fa>\n\n");
+               fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n");
+               return 1;
+       }
+       fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+       seq = kseq_init(fp);
+       dummy.n= dummy.m = 1; dummy.a = calloc(1, 8);
+       while ((l = kseq_read(seq)) >= 0) {
+               int i, k;
+               reglist_t *p = 0;
+               if (h) {
+                       khint_t k = kh_get(reg, h, seq->name.s);
+                       if (k != kh_end(h)) p = &kh_val(h, k);
+               } else {
+                       p = &dummy;
+                       dummy.a[0] = l;
+               }
+               for (k = 0; p && k < p->n; ++k) {
+                       int beg = p->a[k]>>32, end = p->a[k]&0xffffffff;
+                       int la, lb, lc, na, nb, nc, cnt[11];
+                       if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb];
+                       else la = 'a', lb = -1, lc = 0;
+                       na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb];
+                       memset(cnt, 0, 11 * sizeof(int));
+                       for (i = beg; i < end; ++i) {
+                               int is_CpG = 0, a, b, c;
+                               a = na; b = nb; c = nc;
+                               na = seq->seq.s[i+1]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb];
+                               if (b == 2 || b == 10) { // C or Y
+                                       if (nb == 4 || nb == 5) is_CpG = 1;
+                               } else if (b == 4 || b == 5) { // G or R
+                                       if (lb == 2 || lb == 10) is_CpG = 1;
+                               }
+                               if (upper_only == 0 || isupper(a)) {
+                                       if (c > 1) ++cnt[c+2];
+                                       if (c == 1) ++cnt[seq_nt16to4_table[b]];
+                                       if (b == 10 || b == 5) ++cnt[9];
+                                       else if (c == 2) {
+                                               ++cnt[8];
+                                       }
+                                       if (is_CpG) {
+                                               ++cnt[7];
+                                               if (b == 10 || b == 5) ++cnt[10];
+                                       }
+                               }
+                               la = a; lb = b; lc = c;
+                       }
+                       if (h) printf("%s\t%d\t%d", seq->name.s, beg, end);
+                       else printf("%s\t%d", seq->name.s, l);
+                       for (i = 0; i < 11; ++i) printf("\t%d", cnt[i]);
+                       putchar('\n');
+               }
+               fflush(stdout);
+       }
+       free(dummy.a);
+       kseq_destroy(seq);
+       gzclose(fp);
+       return 0;
+}
+
+int stk_randbase(int argc, char *argv[])
+{
+       gzFile fp;
+       kseq_t *seq;
+       int l;
+       if (argc == 1) {
+               fprintf(stderr, "Usage: seqtk randbase <in.fa>\n");
+               return 1;
+       }
+       fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r");
+       seq = kseq_init(fp);
+       while ((l = kseq_read(seq)) >= 0) {
+               int i;
+               printf(">%s", seq->name.s);
+               for (i = 0; i < l; ++i) {
+                       int c, b, a, j, k, m;
+                       b = seq->seq.s[i];
+                       c = seq_nt16_table[b];
+                       a = bitcnt_table[c];
+                       if (a == 2) {
+                               m = (drand48() < 0.5);
+                               for (j = k = 0; j < 4; ++j) {
+                                       if ((1<<j & c) == 0) continue;
+                                       if (k == m) break;
+                                       ++k;
+                               }
+                               seq->seq.s[i] = islower(b)? "acgt"[j] : "ACGT"[j];
+                       }
+                       if (i%60 == 0) putchar('\n');
+                       putchar(seq->seq.s[i]);
+               }
+               putchar('\n');
+       }
+       kseq_destroy(seq);
+       gzclose(fp);
+       return 0;
+}
+
+int stk_hety(int argc, char *argv[])
+{
+       gzFile fp;
+       kseq_t *seq;
+       int l, c, win_size = 50000, n_start = 5, win_step, is_lower_mask = 0;
+       char *buf;
+       uint32_t cnt[3];
+       if (argc == 1) {
+               fprintf(stderr, "\n");
+               fprintf(stderr, "Usage:   seqtk hety [options] <in.fa>\n\n");
+               fprintf(stderr, "Options: -w INT   window size [%d]\n", win_size);
+               fprintf(stderr, "         -t INT   # start positions in a window [%d]\n", n_start);
+               fprintf(stderr, "         -m       treat lowercases as masked\n");
+               fprintf(stderr, "\n");
+               return 1;
+       }
+       while ((c = getopt(argc, argv, "w:t:m")) >= 0) {
+               switch (c) {
+               case 'w': win_size = atoi(optarg); break;
+               case 't': n_start = atoi(optarg); break;
+               case 'm': is_lower_mask = 1; break;
+               }
+       }
+       fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+       seq = kseq_init(fp);
+       win_step = win_size / n_start;
+       buf = calloc(win_size, 1);
+       while ((l = kseq_read(seq)) >= 0) {
+               int x, i, y, z, next = 0;
+               cnt[0] = cnt[1] = cnt[2] = 0;
+               for (i = 0; i <= l; ++i) {
+                       if ((i >= win_size && i % win_step == 0) || i == l) {
+                               if (i == l && l >= win_size) {
+                                       for (y = l - win_size; y < next; ++y) --cnt[(int)buf[y % win_size]];
+                               }
+                               if (cnt[1] + cnt[2] > 0)
+                                       printf("%s\t%d\t%d\t%.2lf\t%d\t%d\n", seq->name.s, next, i,
+                                                  (double)cnt[2] / (cnt[1] + cnt[2]) * win_size, cnt[1] + cnt[2], cnt[2]);
+                               next = i;
+                       }
+                       if (i < l) {
+                               y = i % win_size;
+                               c = seq->seq.s[i];
+                               if (is_lower_mask && islower(c)) c = 'N';
+                               c = seq_nt16_table[c];
+                               x = bitcnt_table[c];
+                               if (i >= win_size) --cnt[(int)buf[y]];
+                               buf[y] = z = x > 2? 0 : x == 2? 2 : 1;
+                               ++cnt[z];
+                       }
+               }
+       }
+       free(buf);
+       kseq_destroy(seq);
+       gzclose(fp);
+       return 0;
+}
+
+/* fq2fa */
+int stk_fq2fa(int argc, char *argv[])
+{
+       gzFile fp;
+       kseq_t *seq;
+       char *buf;
+       int l, i, c, qual_thres = 0, linelen = 60;
+       while ((c = getopt(argc, argv, "q:l:")) >= 0) {
+               switch (c) {
+                       case 'q': qual_thres = atoi(optarg); break;
+                       case 'l': linelen = atoi(optarg); break;
+               }
+       }
+       if (argc == optind) {
+               fprintf(stderr, "Usage: seqtk fq2fa [-q qualThres=0] [-l lineLen=60] <in.fq>\n");
+               return 1;
+       }
+       buf = linelen > 0? malloc(linelen + 1) : 0;
+       fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+       seq = kseq_init(fp);
+       while ((l = kseq_read(seq)) >= 0) {
+               if (seq->qual.l && qual_thres > 0) {
+                       for (i = 0; i < l; ++i)
+                               if (seq->qual.s[i] - 33 < qual_thres)
+                                       seq->seq.s[i] = tolower(seq->seq.s[i]);
+               }
+               putchar('>');
+               if (seq->comment.l) {
+                       fputs(seq->name.s, stdout);
+                       putchar(' ');
+                       puts(seq->comment.s);
+               } else puts(seq->name.s);
+               if (buf) { // multi-line
+                       for (i = 0; i < l; i += linelen) {
+                               int x = i + linelen < l? linelen : l - i;
+                               memcpy(buf, seq->seq.s + i, x);
+                               buf[x] = 0;
+                               puts(buf);
+                       }
+               } else puts(seq->seq.s);
+       }
+       free(buf);
+       kseq_destroy(seq);
+       gzclose(fp);
+       return 0;
+}
+
+int stk_maskseq(int argc, char *argv[])
+{
+       khash_t(reg) *h = kh_init(reg);
+       gzFile fp;
+       kseq_t *seq;
+       int l, i, j, c, is_complement = 0, is_lower = 0;
+       khint_t k;
+       while ((c = getopt(argc, argv, "cl")) >= 0) {
+               switch (c) {
+               case 'c': is_complement = 1; break;
+               case 'l': is_lower = 1; break;
+               }
+       }
+       if (argc - optind < 2) {
+               fprintf(stderr, "Usage:   seqtk maskseq [-cl] <in.fa> <in.bed>\n\n");
+               fprintf(stderr, "Options: -c     mask the complement regions\n");
+               fprintf(stderr, "         -l     soft mask (to lower cases)\n");
+               return 1;
+       }
+       h = stk_reg_read(argv[optind+1]);
+       // maskseq
+       fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+       seq = kseq_init(fp);
+       while ((l = kseq_read(seq)) >= 0) {
+               k = kh_get(reg, h, seq->name.s);
+               if (k == kh_end(h)) { // not found in the hash table
+                       if (is_complement) {
+                               for (j = 0; j < l; ++j)
+                                       seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N';
+                       }
+               } else {
+                       reglist_t *p = &kh_val(h, k);
+                       if (!is_complement) {
+                               for (i = 0; i < p->n; ++i) {
+                                       int beg = p->a[i]>>32, end = p->a[i];
+                                       if (beg >= seq->seq.l) {
+                                               fprintf(stderr, "[maskseq] start position >= the sequence length.\n");
+                                               continue;
+                                       }
+                                       if (end >= seq->seq.l) end = seq->seq.l;
+                                       if (is_lower) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]);
+                                       else for (j = beg; j < end; ++j) seq->seq.s[j] = 'N';
+                               }
+                       } else {
+                               int8_t *mask = calloc(seq->seq.l, 1);
+                               for (i = 0; i < p->n; ++i) {
+                                       int beg = p->a[i]>>32, end = p->a[i];
+                                       if (end >= seq->seq.l) end = seq->seq.l;
+                                       for (j = beg; j < end; ++j) mask[j] = 1;
+                               }
+                               for (j = 0; j < l; ++j)
+                                       if (mask[j] == 0) seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N';
+                               free(mask);
+                       }
+               }
+               printf(">%s", seq->name.s);
+               for (j = 0; j < seq->seq.l; ++j) {
+                       if (j%60 == 0) putchar('\n');
+                       putchar(seq->seq.s[j]);
+               }
+               putchar('\n');
+       }
+       // free
+       kseq_destroy(seq);
+       gzclose(fp);
+       stk_reg_destroy(h);
+       return 0;
+}
+
+/* subseq */
+
+int stk_subseq(int argc, char *argv[])
+{
+       khash_t(reg) *h = kh_init(reg);
+       gzFile fp;
+       kseq_t *seq;
+       int l, i, j, c, is_tab = 0;
+       khint_t k;
+       while ((c = getopt(argc, argv, "t")) >= 0) {
+               switch (c) {
+               case 't': is_tab = 1; break;
+               }
+       }
+       if (optind + 2 > argc) {
+               fprintf(stderr, "Usage: seqtk subseq [-t] <in.fa> <in.bed>\n\n");
+               fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n");
+               return 1;
+       }
+       h = stk_reg_read(argv[optind+1]);
+       // subseq
+       fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
+       seq = kseq_init(fp);
+       while ((l = kseq_read(seq)) >= 0) {
+               reglist_t *p;
+               k = kh_get(reg, h, seq->name.s);
+               if (k == kh_end(h)) continue;
+               p = &kh_val(h, k);
+               for (i = 0; i < p->n; ++i) {
+                       int beg = p->a[i]>>32, end = p->a[i];
+                       if (beg >= seq->seq.l) {
+                               fprintf(stderr, "[subseq] %s: %d >= %ld\n", seq->name.s, beg, seq->seq.l);
+                               continue;
+                       }
+                       if (end > seq->seq.l) end = seq->seq.l;
+                       if (is_tab == 0) {
+                               printf("%c%s", seq->qual.l == seq->seq.l? '@' : '>', seq->name.s);
+                               if (end == INT_MAX) {
+                                       if (beg) printf(":%d", beg+1);
+                               } else printf(":%d-%d", beg+1, end);
+                       } else printf("%s\t%d\t", seq->name.s, beg + 1);
+                       if (end > seq->seq.l) end = seq->seq.l;
+                       for (j = 0; j < end - beg; ++j) {
+                               if (is_tab == 0 && j % 60 == 0) putchar('\n');
+                               putchar(seq->seq.s[j + beg]);
+                       }
+                       putchar('\n');
+                       if (seq->qual.l != seq->seq.l || is_tab) continue;
+                       printf("+");
+                       for (j = 0; j < end - beg; ++j) {
+                               if (j % 60 == 0) putchar('\n');
+                               putchar(seq->qual.s[j + beg]);
+                       }
+                       putchar('\n');
+               }
+       }
+       // free
+       kseq_destroy(seq);
+       gzclose(fp);
+       stk_reg_destroy(h);
+       return 0;
+}
+
+/* mergefa */
+int stk_mergefa(int argc, char *argv[])
+{
+       gzFile fp[2];
+       kseq_t *seq[2];
+       int i, l, c, is_intersect = 0, is_haploid = 0, qual = 0, is_mask = 0;
+       while ((c = getopt(argc, argv, "himq:")) >= 0) {
+               switch (c) {
+                       case 'i': is_intersect = 1; break;
+                       case 'h': is_haploid = 1; break;
+                       case 'm': is_mask = 1; break;
+                       case 'q': qual = atoi(optarg); break;
+               }
+       }
+       if (is_mask && is_intersect) {
+               fprintf(stderr, "[%s] `-i' and `-h' cannot be applied at the same time.\n", __func__);
+               return 1;
+       }
+       if (optind + 2 > argc) {
+               fprintf(stderr, "\nUsage: seqtk mergefa [options] <in1.fa> <in2.fa>\n\n");
+               fprintf(stderr, "Options: -q INT   quality threshold [0]\n");
+               fprintf(stderr, "         -i       take intersection\n");
+               fprintf(stderr, "         -m       convert to lowercase when one of the input base is N.\n");
+               fprintf(stderr, "         -h       suppress hets in the input\n\n");
+               return 1;
+       }
+       for (i = 0; i < 2; ++i) {
+               fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r");
+               seq[i] = kseq_init(fp[i]);
+       }
+       while (kseq_read(seq[0]) >= 0) {
+               int min_l, c[2], is_upper;
+               kseq_read(seq[1]);
+               if (strcmp(seq[0]->name.s, seq[1]->name.s))
+                       fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s);
+               if (seq[0]->seq.l != seq[1]->seq.l)
+                       fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l);
+               min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l;
+               printf(">%s", seq[0]->name.s);
+               for (l = 0; l < min_l; ++l) {
+                       c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l];
+                       if (seq[0]->qual.l && seq[0]->qual.s[l] - 33 < qual) c[0] = tolower(c[0]);
+                       if (seq[1]->qual.l && seq[1]->qual.s[l] - 33 < qual) c[1] = tolower(c[1]);
+                       if (is_intersect) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0;
+                       else if (is_mask) is_upper = (isupper(c[0]) || isupper(c[1]))? 1 : 0;
+                       else is_upper = (isupper(c[0]) && isupper(c[1]))? 1 : 0;
+                       c[0] = seq_nt16_table[c[0]]; c[1] = seq_nt16_table[c[1]];
+                       if (c[0] == 0) c[0] = 15;
+                       if (c[1] == 0) c[1] = 15;
+                       if (is_haploid && (bitcnt_table[c[0]] > 1 || bitcnt_table[c[1]] > 1)) is_upper = 0;
+                       if (is_intersect) {
+                               c[0] = c[0] & c[1];
+                               if (c[0] == 0) is_upper = 0;
+                       } else if (is_mask) {
+                               if (c[0] == 15 || c[1] == 15) is_upper = 0;
+                               c[0] = c[0] & c[1];
+                               if (c[0] == 0) is_upper = 0;
+                       } else c[0] = c[0] | c[1];
+                       c[0] = seq_nt16_rev_table[c[0]];
+                       if (!is_upper) c[0] = tolower(c[0]);
+                       if (l%60 == 0) putchar('\n');
+                       putchar(c[0]);
+               }
+               putchar('\n');
+       }
+       return 0;
+}
+
+int stk_famask(int argc, char *argv[])
+{
+       gzFile fp[2];
+       kseq_t *seq[2];
+       int i, l;
+       if (argc < 3) {
+               fprintf(stderr, "Usage: seqtk famask <src.fa> <mask.fa>\n");
+               return 1;
+       }
+       for (i = 0; i < 2; ++i) {
+               fp[i] = strcmp(argv[optind+i], "-")? gzopen(argv[optind+i], "r") : gzdopen(fileno(stdin), "r");
+               seq[i] = kseq_init(fp[i]);
+       }
+       while (kseq_read(seq[0]) >= 0) {
+               int min_l, c[2];
+               kseq_read(seq[1]);
+               if (strcmp(seq[0]->name.s, seq[1]->name.s))
+                       fprintf(stderr, "[%s] Different sequence names: %s != %s\n", __func__, seq[0]->name.s, seq[1]->name.s);
+               if (seq[0]->seq.l != seq[1]->seq.l)
+                       fprintf(stderr, "[%s] Unequal sequence length: %ld != %ld\n", __func__, seq[0]->seq.l, seq[1]->seq.l);
+               min_l = seq[0]->seq.l < seq[1]->seq.l? seq[0]->seq.l : seq[1]->seq.l;
+               printf(">%s", seq[0]->name.s);
+               for (l = 0; l < min_l; ++l) {
+                       c[0] = seq[0]->seq.s[l]; c[1] = seq[1]->seq.s[l];
+                       if (c[1] == 'x') c[0] = tolower(c[0]);
+                       else if (c[1] != 'X') c[0] = c[1];
+                       if (l%60 == 0) putchar('\n');
+                       putchar(c[0]);
+               }
+               putchar('\n');
+       }
+       return 0;
+}
+
+int stk_mutfa(int argc, char *argv[])
+{
+       khash_t(reg) *h = kh_init(reg);
+       gzFile fp;
+       kseq_t *seq;
+       kstream_t *ks;
+       int l, i, dret;
+       kstring_t *str;
+       khint_t k;
+       if (argc < 3) {
+               fprintf(stderr, "Usage: seqtk mutfa <in.fa> <in.snp>\n\n");
+               fprintf(stderr, "Note: <in.snp> contains at least four columns per line which are:\n");
+               fprintf(stderr, "      'chr  1-based-pos  any  base-changed-to'.\n");
+               return 1;
+       }
+       // read the list
+       str = calloc(1, sizeof(kstring_t));
+       fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
+       ks = ks_init(fp);
+       while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+               char *s = strdup(str->s);
+               int beg = 0, ret;
+               reglist_t *p;
+               k = kh_get(reg, h, s);
+               if (k == kh_end(h)) {
+                       k = kh_put(reg, h, s, &ret);
+                       memset(&kh_val(h, k), 0, sizeof(reglist_t));
+               }
+               p = &kh_val(h, k);
+               if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col
+               ks_getuntil(ks, 0, str, &dret); // 3rd col
+               ks_getuntil(ks, 0, str, &dret); // 4th col
+               // skip the rest of the line
+               if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
+               if (isalpha(str->s[0]) && str->l == 1) {
+                       if (p->n == p->m) {
+                               p->m = p->m? p->m<<1 : 4;
+                               p->a = realloc(p->a, p->m * 8);
+                       }
+                       p->a[p->n++] = (uint64_t)beg<<32 | str->s[0];
+               }
+       }
+       ks_destroy(ks);
+       gzclose(fp);
+       free(str->s); free(str);
+       // mutfa
+       fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
+       seq = kseq_init(fp);
+       while ((l = kseq_read(seq)) >= 0) {
+               reglist_t *p;
+               k = kh_get(reg, h, seq->name.s);
+               if (k != kh_end(h)) {
+                       p = &kh_val(h, k);
+                       for (i = 0; i < p->n; ++i) {
+                               int beg = p->a[i]>>32;
+                               if (beg < seq->seq.l)
+                                       seq->seq.s[beg] = (int)p->a[i];
+                       }
+               }
+               printf(">%s", seq->name.s);
+               for (i = 0; i < l; ++i) {
+                       if (i%60 == 0) putchar('\n');
+                       putchar(seq->seq.s[i]);
+               }
+               putchar('\n');
+       }
+       // free
+       kseq_destroy(seq);
+       gzclose(fp);
+       for (k = 0; k < kh_end(h); ++k) {
+               if (kh_exist(h, k)) {
+                       free(kh_val(h, k).a);
+                       free((char*)kh_key(h, k));
+               }
+       }
+       kh_destroy(reg, h);
+       return 0;
+}
+
+int stk_listhet(int argc, char *argv[])
+{
+       gzFile fp;
+       kseq_t *seq;
+       int i, l;
+       if (argc == 1) {
+               fprintf(stderr, "Usage: seqtk listhet <in.fa>\n");
+               return 1;
+       }
+       fp = (strcmp(argv[1], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[1], "r");
+       seq = kseq_init(fp);
+       while ((l = kseq_read(seq)) >= 0) {
+               for (i = 0; i < l; ++i) {
+                       int b = seq->seq.s[i];
+                       if (bitcnt_table[seq_nt16_table[b]] == 2)
+                               printf("%s\t%d\t%c\n", seq->name.s, i+1, b);
+               }
+       }
+       kseq_destroy(seq);
+       gzclose(fp);
+       return 0;
+}
+
+/* cutN */
+static int cutN_min_N_tract = 1000;
+static int cutN_nonN_penalty = 10;
+
+static int find_next_cut(const kseq_t *ks, int k, int *begin, int *end)
+{
+       int i, b, e;
+       while (k < ks->seq.l) {
+               if (seq_nt16_table[(int)ks->seq.s[k]] == 15) {
+                       int score, max;
+                       score = 0; e = max = -1;
+                       for (i = k; i < ks->seq.l && score >= 0; ++i) { /* forward */
+                               if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score;
+                               else score -= cutN_nonN_penalty;
+                               if (score > max) max = score, e = i;
+                       }
+                       score = 0; b = max = -1;
+                       for (i = e; i >= 0 && score >= 0; --i) { /* backward */
+                               if (seq_nt16_table[(int)ks->seq.s[i]] == 15) ++score;
+                               else score -= cutN_nonN_penalty;
+                               if (score > max) max = score, b = i;
+                       }
+                       if (e + 1 - b >= cutN_min_N_tract) {
+                               *begin = b;
+                               *end = e + 1;
+                               return *end;
+                       }
+                       k = e + 1;
+               } else ++k;
+       }
+       return -1;
+}
+static void print_seq(FILE *fpout, const kseq_t *ks, int begin, int end)
+{
+       int i;
+       if (begin >= end) return; // FIXME: why may this happen? Understand it!
+       fprintf(fpout, ">%s:%d-%d", ks->name.s, begin+1, end);
+       for (i = begin; i < end && i < ks->seq.l; ++i) {
+               if ((i - begin)%60 == 0) fputc('\n', fpout);
+               fputc(ks->seq.s[i], fpout);
+       }
+       fputc('\n', fpout);
+}
+int stk_cutN(int argc, char *argv[])
+{
+       int c, l, gap_only = 0;
+       gzFile fp;
+       kseq_t *ks;
+       while ((c = getopt(argc, argv, "n:p:g")) >= 0) {
+               switch (c) {
+               case 'n': cutN_min_N_tract = atoi(optarg); break;
+               case 'p': cutN_nonN_penalty = atoi(optarg); break;
+               case 'g': gap_only = 1; break;
+               default: return 1;
+               }
+       }
+       if (argc == optind) {
+               fprintf(stderr, "\n");
+               fprintf(stderr, "Usage:   seqtk cutN [options] <in.fa>\n\n");
+               fprintf(stderr, "Options: -n INT    min size of N tract [%d]\n", cutN_min_N_tract);
+               fprintf(stderr, "         -p INT    penalty for a non-N [%d]\n", cutN_nonN_penalty);
+               fprintf(stderr, "         -g        print gaps only, no sequence\n\n");
+               return 1;
+       }
+       fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
+       ks = kseq_init(fp);
+       while ((l = kseq_read(ks)) >= 0) {
+               int k = 0, begin = 0, end = 0;
+               while (find_next_cut(ks, k, &begin, &end) >= 0) {
+                       if (begin != 0) {
+                               if (gap_only) printf("%s\t%d\t%d\n", ks->name.s, begin, end);
+                               else print_seq(stdout, ks, k, begin);
+                       }
+                       k = end;
+               }
+               if (!gap_only) print_seq(stdout, ks, k, l);
+       }
+       kseq_destroy(ks);
+       gzclose(fp);
+       return 0;
+}
+
+/* main function */
+static int usage()
+{
+       fprintf(stderr, "\n");
+       fprintf(stderr, "Usage:   seqtk <command> <arguments>\n\n");
+       fprintf(stderr, "Command: comp      get the nucleotide composite of FASTA/Q\n");
+       fprintf(stderr, "         hety      regional heterozygosity\n");
+       fprintf(stderr, "         fq2fa     convert FASTQ to FASTA\n");
+       fprintf(stderr, "         subseq    extract subsequences from FASTA/Q\n");
+       fprintf(stderr, "         maskseq   mask sequences\n");
+       fprintf(stderr, "         mutfa     point mutate FASTA at specified positions\n");
+       fprintf(stderr, "         mergefa   merge two FASTA/Q files\n");
+       fprintf(stderr, "         randbase  choose a random base from hets\n");
+       fprintf(stderr, "         cutN      cut sequence at long N\n");
+       fprintf(stderr, "         listhet   extract the position of each het\n");
+       fprintf(stderr, "\n");
+       return 1;
+}
+
+int main(int argc, char *argv[])
+{
+       if (argc == 1) return usage();
+       if (strcmp(argv[1], "comp") == 0) stk_comp(argc-1, argv+1);
+       else if (strcmp(argv[1], "hety") == 0) stk_hety(argc-1, argv+1);
+       else if (strcmp(argv[1], "fq2fa") == 0) stk_fq2fa(argc-1, argv+1);
+       else if (strcmp(argv[1], "subseq") == 0) stk_subseq(argc-1, argv+1);
+       else if (strcmp(argv[1], "maskseq") == 0) stk_maskseq(argc-1, argv+1);
+       else if (strcmp(argv[1], "mutfa") == 0) stk_mutfa(argc-1, argv+1);
+       else if (strcmp(argv[1], "mergefa") == 0) stk_mergefa(argc-1, argv+1);
+       else if (strcmp(argv[1], "randbase") == 0) stk_randbase(argc-1, argv+1);
+       else if (strcmp(argv[1], "cutN") == 0) stk_cutN(argc-1, argv+1);
+       else if (strcmp(argv[1], "listhet") == 0) stk_listhet(argc-1, argv+1);
+       else if (strcmp(argv[1], "famask") == 0) stk_famask(argc-1, argv+1);
+       else {
+               fprintf(stderr, "[main] unrecognized commad '%s'. Abort!\n", argv[1]);
+               return 1;
+       }
+       return 0;
+}
diff --git a/sam/phase.c b/sam/phase.c

new file mode 100644 (file)

index 0000000..ef4eff9
--- /dev/null
+++ b/sam/phase.c
@@ -0,0 +1,687 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <math.h>
+#include <zlib.h>
+#include "bam.h"
+#include "errmod.h"
+
+#include "kseq.h"
+KSTREAM_INIT(gzFile, gzread, 16384)
+
+#define MAX_VARS 256
+#define FLIP_PENALTY 2
+#define FLIP_THRES 4
+#define MASK_THRES 3
+
+#define FLAG_FIX_CHIMERA 0x1
+#define FLAG_LIST_EXCL   0x4
+#define FLAG_DROP_AMBI   0x8
+
+typedef struct {
+       // configurations, initialized in the main function
+       int flag, k, min_baseQ, min_varLOD, max_depth;
+       // other global variables
+       int vpos_shift;
+       bamFile fp;
+       char *pre;
+       bamFile out[3];
+       // alignment queue
+       int n, m;
+       bam1_t **b;
+} phaseg_t;
+
+typedef struct {
+       int8_t seq[MAX_VARS]; // TODO: change to dynamic memory allocation!
+       int vpos, beg, end;
+       uint32_t vlen:16, single:1, flip:1, phase:1, phased:1, ambig:1;
+       uint32_t in:16, out:16; // in-phase and out-phase
+} frag_t, *frag_p;
+
+#define rseq_lt(a,b) ((a)->vpos < (b)->vpos)
+
+#include "khash.h"
+KHASH_SET_INIT_INT64(set64)
+KHASH_MAP_INIT_INT64(64, frag_t)
+
+typedef khash_t(64) nseq_t;
+
+#include "ksort.h"
+KSORT_INIT(rseq, frag_p, rseq_lt)
+
+static char nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 };
+
+static inline uint64_t X31_hash_string(const char *s)
+{
+       uint64_t h = *s;
+       if (h) for (++s ; *s; ++s) h = (h << 5) - h + *s;
+       return h;
+}
+
+static void count1(int l, const uint8_t *seq, int *cnt)
+{
+       int i, j, n_ambi;
+       uint32_t z, x;
+       if (seq[l-1] == 0) return; // do nothing is the last base is ambiguous
+       for (i = n_ambi = 0; i < l; ++i) // collect ambiguous bases
+               if (seq[i] == 0) ++n_ambi;
+       if (l - n_ambi <= 1) return; // only one SNP
+       for (x = 0; x < 1u<<n_ambi; ++x) { // count
+               for (i = j = 0, z = 0; i < l; ++i) {
+                       int c;
+                       if (seq[i]) c = seq[i] - 1;
+                       else {
+                               c = x>>j&1;
+                               ++j;
+                       }
+                       z = z<<1 | c;
+               }
+               ++cnt[z];
+       }
+}
+
+static int **count_all(int l, int vpos, nseq_t *hash)
+{
+       khint_t k;
+       int i, j, **cnt;
+       uint8_t *seq;
+       seq = calloc(l, 1);
+       cnt = calloc(vpos, sizeof(void*));
+       for (i = 0; i < vpos; ++i) cnt[i] = calloc(1<<l, sizeof(int));
+       for (k = 0; k < kh_end(hash); ++k) {
+               if (kh_exist(hash, k)) {
+                       frag_t *f = &kh_val(hash, k);
+                       if (f->vpos >= vpos || f->single) continue; // out of region; or singleton
+                       if (f->vlen == 1) { // such reads should be flagged as deleted previously if everything is right
+                               f->single = 1;
+                               continue;
+                       }
+                       for (j = 1; j < f->vlen; ++j) {
+                               for (i = 0; i < l; ++i)
+                                       seq[i] = j < l - 1 - i? 0 : f->seq[j - (l - 1 - i)];
+                               count1(l, seq, cnt[f->vpos + j]);
+                       }
+               }
+       }
+       free(seq);
+       return cnt;
+}
+
+// phasing
+static int8_t *dynaprog(int l, int vpos, int **w)
+{
+       int *f[2], *curr, *prev, max, i;
+       int8_t **b, *h = 0;
+       uint32_t x, z = 1u<<(l-1), mask = (1u<<l) - 1;
+       f[0] = calloc(z, sizeof(int));
+       f[1] = calloc(z, sizeof(int));
+       b = calloc(vpos, sizeof(void*));
+       prev = f[0]; curr = f[1];
+       // fill the backtrack matrix
+       for (i = 0; i < vpos; ++i) {
+               int *wi = w[i], *tmp;
+               int8_t *bi;
+               bi = b[i] = calloc(z, 1);
+               /* In the following, x is the current state, which is the
+                * lexicographically smaller local haplotype. xc is the complement of
+                * x, or the larger local haplotype; y0 and y1 are the two predecessors
+                * of x. */
+               for (x = 0; x < z; ++x) { // x0 is the smaller 
+                       uint32_t y0, y1, xc;
+                       int c0, c1;
+                       xc = ~x&mask; y0 = x>>1; y1 = xc>>1;
+                       c0 = prev[y0] + wi[x] + wi[xc];
+                       c1 = prev[y1] + wi[x] + wi[xc];
+                       if (c0 > c1) bi[x] = 0, curr[x] = c0;
+                       else bi[x] = 1, curr[x] = c1;
+               }
+               tmp = prev; prev = curr; curr = tmp; // swap
+       }
+       { // backtrack
+               uint32_t max_x = 0;
+               int which = 0;
+               h = calloc(vpos, 1);
+               for (x = 0, max = 0, max_x = 0; x < z; ++x)
+                       if (prev[x] > max) max = prev[x], max_x = x;
+               for (i = vpos - 1, x = max_x; i >= 0; --i) {
+                       h[i] = which? (~x&1) : (x&1);
+                       which = b[i][x]? !which : which;
+                       x = b[i][x]? (~x&mask)>>1 : x>>1;
+               }
+       }
+       // free
+       for (i = 0; i < vpos; ++i) free(b[i]);
+       free(f[0]); free(f[1]); free(b);
+       return h;
+}
+
+// phase each fragment
+static uint64_t *fragphase(int vpos, const int8_t *path, nseq_t *hash, int flip)
+{
+       khint_t k;
+       uint64_t *pcnt;
+       uint32_t *left, *rght, max;
+       left = rght = 0; max = 0;
+       pcnt = calloc(vpos, 8);
+       for (k = 0; k < kh_end(hash); ++k) {
+               if (kh_exist(hash, k)) {
+                       int i, c[2];
+                       frag_t *f = &kh_val(hash, k);
+                       if (f->vpos >= vpos) continue;
+                       // get the phase
+                       c[0] = c[1] = 0;
+                       for (i = 0; i < f->vlen; ++i) {
+                               if (f->seq[i] == 0) continue;
+                               ++c[f->seq[i] == path[f->vpos + i] + 1? 0 : 1];
+                       }
+                       f->phase = c[0] > c[1]? 0 : 1;
+                       f->in = c[f->phase]; f->out = c[1 - f->phase];
+                       f->phased = f->in == f->out? 0 : 1;
+                       f->ambig = (f->in && f->out && f->out < 3 && f->in <= f->out + 1)? 1 : 0;
+                       // fix chimera
+                       f->flip = 0;
+                       if (flip && c[0] >= 3 && c[1] >= 3) {
+                               int sum[2], m, mi, md;
+                               if (f->vlen > max) { // enlarge the array
+                                       max = f->vlen;
+                                       kroundup32(max);
+                                       left = realloc(left, max * 4);
+                                       rght = realloc(rght, max * 4);
+                               }
+                               for (i = 0, sum[0] = sum[1] = 0; i < f->vlen; ++i) { // get left counts
+                                       if (f->seq[i]) {
+                                               int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
+                                               ++sum[c == path[f->vpos + i]? 0 : 1];
+                                       }
+                                       left[i] = sum[1]<<16 | sum[0];
+                               }
+                               for (i = f->vlen - 1, sum[0] = sum[1] = 0; i >= 0; --i) { // get right counts
+                                       if (f->seq[i]) {
+                                               int c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
+                                               ++sum[c == path[f->vpos + i]? 0 : 1];
+                                       }
+                                       rght[i] = sum[1]<<16 | sum[0];
+                               }
+                               // find the best flip point
+                               for (i = m = 0, mi = -1, md = -1; i < f->vlen - 1; ++i) {
+                                       int a[2];
+                                       a[0] = (left[i]&0xffff) + (rght[i+1]>>16&0xffff) - (rght[i+1]&0xffff) * FLIP_PENALTY;
+                                       a[1] = (left[i]>>16&0xffff) + (rght[i+1]&0xffff) - (rght[i+1]>>16&0xffff) * FLIP_PENALTY;
+                                       if (a[0] > a[1]) {
+                                               if (a[0] > m) m = a[0], md = 0, mi = i;
+                                       } else {
+                                               if (a[1] > m) m = a[1], md = 1, mi = i;
+                                       }
+                               }
+                               if (m - c[0] >= FLIP_THRES && m - c[1] >= FLIP_THRES) { // then flip
+                                       f->flip = 1;
+                                       if (md == 0) { // flip the tail
+                                               for (i = mi + 1; i < f->vlen; ++i)
+                                                       if (f->seq[i] == 1) f->seq[i] = 2;
+                                                       else if (f->seq[i] == 2) f->seq[i] = 1;
+                                       } else { // flip the head
+                                               for (i = 0; i <= mi; ++i)
+                                                       if (f->seq[i] == 1) f->seq[i] = 2;
+                                                       else if (f->seq[i] == 2) f->seq[i] = 1;
+                                       }
+                               }
+                       }
+                       // update pcnt[]
+                       if (!f->single) {
+                               for (i = 0; i < f->vlen; ++i) {
+                                       int c;
+                                       if (f->seq[i] == 0) continue;
+                                       c = f->phase? 2 - f->seq[i] : f->seq[i] - 1;
+                                       if (c == path[f->vpos + i]) {
+                                               if (f->phase == 0) ++pcnt[f->vpos + i];
+                                               else pcnt[f->vpos + i] += 1ull<<32;
+                                       } else {
+                                               if (f->phase == 0) pcnt[f->vpos + i] += 1<<16;
+                                               else pcnt[f->vpos + i] += 1ull<<48;
+                                       }
+                               }
+                       }
+               }
+       }
+       free(left); free(rght);
+       return pcnt;
+}
+
+static uint64_t *genmask(int vpos, const uint64_t *pcnt, int *_n)
+{
+       int i, max = 0, max_i = -1, m = 0, n = 0, beg = 0, score = 0;
+       uint64_t *list = 0;
+       for (i = 0; i < vpos; ++i) {
+               uint64_t x = pcnt[i];
+               int c[4], pre = score, s;
+               c[0] = x&0xffff; c[1] = x>>16&0xffff; c[2] = x>>32&0xffff; c[3] = x>>48&0xffff;
+               s = (c[1] + c[3] == 0)? -(c[0] + c[2]) : (c[1] + c[3] - 1);
+               if (c[3] > c[2]) s += c[3] - c[2];
+               if (c[1] > c[0]) s += c[1] - c[0];
+               score += s;
+               if (score < 0) score = 0;
+               if (pre == 0 && score > 0) beg = i; // change from zero to non-zero
+               if ((i == vpos - 1 || score == 0) && max >= MASK_THRES) {
+                       if (n == m) {
+                               m = m? m<<1 : 4;
+                               list = realloc(list, m * 8);
+                       }
+                       list[n++] = (uint64_t)beg<<32 | max_i;
+                       i = max_i; // reset i to max_i
+                       score = 0;
+               } else if (score > max) max = score, max_i = i;
+               if (score == 0) max = 0;
+       }
+       *_n = n;
+       return list;
+}
+
+// trim heading and tailing ambiguous bases; mark deleted and remove sequence
+static int clean_seqs(int vpos, nseq_t *hash)
+{
+       khint_t k;
+       int ret = 0;
+       for (k = 0; k < kh_end(hash); ++k) {
+               if (kh_exist(hash, k)) {
+                       frag_t *f = &kh_val(hash, k);
+                       int beg, end, i;
+                       if (f->vpos >= vpos) {
+                               ret = 1;
+                               continue;
+                       }
+                       for (i = 0; i < f->vlen; ++i)
+                               if (f->seq[i] != 0) break;
+                       beg = i;
+                       for (i = f->vlen - 1; i >= 0; --i)
+                               if (f->seq[i] != 0) break;
+                       end = i + 1;
+                       if (end - beg <= 0) kh_del(64, hash, k);
+                       else {
+                               if (beg != 0) memmove(f->seq, f->seq + beg, end - beg);
+                               f->vpos += beg; f->vlen = end - beg;
+                               f->single = f->vlen == 1? 1 : 0;
+                       }
+               }
+       }
+       return ret;
+}
+
+static void dump_aln(phaseg_t *g, int min_pos, const nseq_t *hash)
+{
+       int i, is_flip, drop_ambi;
+       drop_ambi = g->flag & FLAG_DROP_AMBI;
+       is_flip = (drand48() < 0.5);
+       for (i = 0; i < g->n; ++i) {
+               int end, which;
+               uint64_t key;
+               khint_t k;
+               bam1_t *b = g->b[i];
+               key = X31_hash_string(bam1_qname(b));
+               end = bam_calend(&b->core, bam1_cigar(b));
+               if (end > min_pos) break;
+               k = kh_get(64, hash, key);
+               if (k == kh_end(hash)) which = 3;
+               else {
+                       frag_t *f = &kh_val(hash, k);
+                       if (f->ambig) which = drop_ambi? 2 : 3;
+                       else if (f->phased && f->flip) which = 2;
+                       else if (f->phased == 0) which = 3;
+                       else { // phased and not flipped
+                               char c = 'Y';
+                               which = f->phase;
+                               bam_aux_append(b, "ZP", 'A', 1, (uint8_t*)&c);
+                       }
+                       if (which < 2 && is_flip) which = 1 - which; // increase the randomness
+               }
+               if (which == 3) which = (drand48() < 0.5);
+               bam_write1(g->out[which], b);
+               bam_destroy1(b);
+               g->b[i] = 0;
+       }
+       memmove(g->b, g->b + i, (g->n - i) * sizeof(void*));
+       g->n -= i;
+}
+
+static int phase(phaseg_t *g, const char *chr, int vpos, uint64_t *cns, nseq_t *hash)
+{
+       int i, j, n_seqs = kh_size(hash), n_masked = 0, min_pos;
+       khint_t k;
+       frag_t **seqs;
+       int8_t *path, *sitemask;
+       uint64_t *pcnt, *regmask;
+
+       if (vpos == 0) return 0;
+       i = clean_seqs(vpos, hash); // i is true if hash has an element with its vpos >= vpos
+       min_pos = i? cns[vpos]>>32 : 0x7fffffff;
+       if (vpos == 1) {
+               printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1);
+               printf("M0\t%s\t%d\t%d\t%c\t%c\t%d\t0\t0\t0\t0\n//\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[0]>>32) + 1,
+                       "ACGTX"[cns[0]&3], "ACGTX"[cns[0]>>16&3], g->vpos_shift + 1);
+               for (k = 0; k < kh_end(hash); ++k) {
+                       if (kh_exist(hash, k)) {
+                               frag_t *f = &kh_val(hash, k);
+                               if (f->vpos) continue;
+                               f->flip = 0;
+                               if (f->seq[0] == 0) f->phased = 0;
+                               else f->phased = 1, f->phase = f->seq[0] - 1;
+                       }
+               }
+               dump_aln(g, min_pos, hash);
+               ++g->vpos_shift;
+               return 1;
+       }
+       { // phase
+               int **cnt;
+               uint64_t *mask;
+               printf("PS\t%s\t%d\t%d\n", chr, (int)(cns[0]>>32) + 1, (int)(cns[vpos-1]>>32) + 1);
+               sitemask = calloc(vpos, 1);
+               cnt = count_all(g->k, vpos, hash);
+               path = dynaprog(g->k, vpos, cnt);
+               for (i = 0; i < vpos; ++i) free(cnt[i]);
+               free(cnt);
+               pcnt = fragphase(vpos, path, hash, 0); // do not fix chimeras when masking
+               mask = genmask(vpos, pcnt, &n_masked);
+               regmask = calloc(n_masked, 8);
+               for (i = 0; i < n_masked; ++i) {
+                       regmask[i] = cns[mask[i]>>32]>>32<<32 | cns[(uint32_t)mask[i]]>>32;
+                       for (j = mask[i]>>32; j <= (int32_t)mask[i]; ++j)
+                               sitemask[j] = 1;
+               }
+               free(mask);
+               if (g->flag & FLAG_FIX_CHIMERA) {
+                       free(pcnt);
+                       pcnt = fragphase(vpos, path, hash, 1);
+               }
+       }
+       for (i = 0; i < n_masked; ++i)
+               printf("FL\t%s\t%d\t%d\n", chr, (int)(regmask[i]>>32) + 1, (int)regmask[i] + 1);
+       for (i = 0; i < vpos; ++i) {
+               uint64_t x = pcnt[i];
+               int8_t c[2];
+               c[0] = (cns[i]&0xffff)>>2 == 0? 4 : (cns[i]&3);
+               c[1] = (cns[i]>>16&0xffff)>>2 == 0? 4 : (cns[i]>>16&3);
+               printf("M%d\t%s\t%d\t%d\t%c\t%c\t%d\t%d\t%d\t%d\t%d\n", sitemask[i]+1, chr, (int)(cns[0]>>32) + 1, (int)(cns[i]>>32) + 1, "ACGTX"[c[path[i]]], "ACGTX"[c[1-path[i]]],
+                       i + g->vpos_shift + 1, (int)(x&0xffff), (int)(x>>16&0xffff), (int)(x>>32&0xffff), (int)(x>>48&0xffff));
+       }
+       free(path); free(pcnt); free(regmask); free(sitemask);
+       seqs = calloc(n_seqs, sizeof(void*));
+       for (k = 0, i = 0; k < kh_end(hash); ++k) 
+               if (kh_exist(hash, k) && kh_val(hash, k).vpos < vpos && !kh_val(hash, k).single)
+                       seqs[i++] = &kh_val(hash, k);
+       n_seqs = i;
+       ks_introsort_rseq(n_seqs, seqs);
+       for (i = 0; i < n_seqs; ++i) {
+               frag_t *f = seqs[i];
+               printf("EV\t0\t%s\t%d\t40\t%dM\t*\t0\t0\t", chr, f->vpos + 1 + g->vpos_shift, f->vlen);
+               for (j = 0; j < f->vlen; ++j) {
+                       uint32_t c = cns[f->vpos + j];
+                       if (f->seq[j] == 0) putchar('N');
+                       else putchar("ACGT"[f->seq[j] == 1? (c&3) : (c>>16&3)]);
+               }
+               printf("\t*\tYP:i:%d\tYF:i:%d\tYI:i:%d\tYO:i:%d\tYS:i:%d\n", f->phase, f->flip, f->in, f->out, f->beg+1);
+       }
+       free(seqs);
+       printf("//\n");
+       fflush(stdout);
+       g->vpos_shift += vpos;
+       dump_aln(g, min_pos, hash);
+       return vpos;
+}
+
+static void update_vpos(int vpos, nseq_t *hash)
+{
+       khint_t k;
+       for (k = 0; k < kh_end(hash); ++k) {
+               if (kh_exist(hash, k)) {
+                       frag_t *f = &kh_val(hash, k);
+                       if (f->vpos < vpos) kh_del(64, hash, k); // TODO: if frag_t::seq is allocated dynamically, free it
+                       else f->vpos -= vpos;
+               }
+       }
+}
+
+static nseq_t *shrink_hash(nseq_t *hash) // TODO: to implement
+{
+       return hash;
+}
+
+static int readaln(void *data, bam1_t *b)
+{
+       phaseg_t *g = (phaseg_t*)data;
+       int ret;
+       ret = bam_read1(g->fp, b);
+       if (ret < 0) return ret;
+       if (!(b->core.flag & (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP)) && g->pre) {
+               if (g->n == g->m) {
+                       g->m = g->m? g->m<<1 : 16;
+                       g->b = realloc(g->b, g->m * sizeof(void*));
+               }
+               g->b[g->n++] = bam_dup1(b);
+       }
+       return ret;
+}
+
+static khash_t(set64) *loadpos(const char *fn, bam_header_t *h)
+{
+       gzFile fp;
+       kstream_t *ks;
+       int ret, dret;
+       kstring_t *str;
+       khash_t(set64) *hash;
+
+       hash = kh_init(set64);
+       str = calloc(1, sizeof(kstring_t));
+       fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
+       ks = ks_init(fp);
+       while (ks_getuntil(ks, 0, str, &dret) >= 0) {
+               int tid = bam_get_tid(h, str->s);
+               if (tid >= 0 && dret != '\n') {
+                       if (ks_getuntil(ks, 0, str, &dret) >= 0) {
+                               uint64_t x = (uint64_t)tid<<32 | (atoi(str->s) - 1);
+                               kh_put(set64, hash, x, &ret);
+                       } else break;
+               }
+               if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
+               if (dret < 0) break;
+       }
+       ks_destroy(ks);
+       gzclose(fp);
+       free(str->s); free(str);
+       return hash;
+}
+
+static int gl2cns(float q[16])
+{
+       int i, j, min_ij;
+       float min, min2;
+       min = min2 = 1e30; min_ij = -1;
+       for (i = 0; i < 4; ++i) {
+               for (j = i; j < 4; ++j) {
+                       if (q[i<<2|j] < min) min_ij = i<<2|j, min2 = min, min = q[i<<2|j];
+                       else if (q[i<<2|j] < min2) min2 = q[i<<2|j];
+               }
+       }
+       return (min_ij>>2&3) == (min_ij&3)? 0 : 1<<18 | (min_ij>>2&3)<<16 | (min_ij&3) | (int)(min2 - min + .499) << 2;
+}
+
+int main_phase(int argc, char *argv[])
+{
+       extern void bam_init_header_hash(bam_header_t *header);
+       int c, tid, pos, vpos = 0, n, lasttid = -1, max_vpos = 0;
+       const bam_pileup1_t *plp;
+       bam_plp_t iter;
+       bam_header_t *h;
+       nseq_t *seqs;
+       uint64_t *cns = 0;
+       phaseg_t g;
+       char *fn_list = 0;
+       khash_t(set64) *set = 0;
+       errmod_t *em;
+       uint16_t *bases;
+
+       memset(&g, 0, sizeof(phaseg_t));
+       g.flag = FLAG_FIX_CHIMERA;
+       g.min_varLOD = 37; g.k = 13; g.min_baseQ = 13; g.max_depth = 256;
+       while ((c = getopt(argc, argv, "Q:eFq:k:b:l:D:A:")) >= 0) {
+               switch (c) {
+                       case 'D': g.max_depth = atoi(optarg); break;
+                       case 'q': g.min_varLOD = atoi(optarg); break;
+                       case 'Q': g.min_baseQ = atoi(optarg); break;
+                       case 'k': g.k = atoi(optarg); break;
+                       case 'F': g.flag &= ~FLAG_FIX_CHIMERA; break;
+                       case 'e': g.flag |= FLAG_LIST_EXCL; break;
+                       case 'A': g.flag |= FLAG_DROP_AMBI; break;
+                       case 'b': g.pre = strdup(optarg); break;
+                       case 'l': fn_list = strdup(optarg); break;
+               }
+       }
+       if (argc == optind) {
+               fprintf(stderr, "\n");
+               fprintf(stderr, "Usage:   samtools phase [options] <in.bam>\n\n");
+               fprintf(stderr, "Options: -k INT    block length [%d]\n", g.k);
+               fprintf(stderr, "         -b STR    prefix of BAMs to output [null]\n");
+               fprintf(stderr, "         -q INT    min het phred-LOD [%d]\n", g.min_varLOD);
+               fprintf(stderr, "         -Q INT    min base quality in het calling [%d]\n", g.min_baseQ);
+               fprintf(stderr, "         -D INT    max read depth [%d]\n", g.max_depth);
+//             fprintf(stderr, "         -l FILE   list of sites to phase [null]\n");
+               fprintf(stderr, "         -F        do not attempt to fix chimeras\n");
+               fprintf(stderr, "         -A        drop reads with ambiguous phase\n");
+//             fprintf(stderr, "         -e        do not discover SNPs (effective with -l)\n");
+               fprintf(stderr, "\n");
+               return 1;
+       }
+       g.fp = strcmp(argv[optind], "-")? bam_open(argv[optind], "r") : bam_dopen(fileno(stdin), "r");
+       h = bam_header_read(g.fp);
+       if (fn_list) { // read the list of sites to phase
+               bam_init_header_hash(h);
+               set = loadpos(fn_list, h);
+               free(fn_list);
+       } else g.flag &= ~FLAG_LIST_EXCL;
+       if (g.pre) { // open BAMs to write
+               char *s = malloc(strlen(g.pre) + 20);
+               strcpy(s, g.pre); strcat(s, ".0.bam"); g.out[0] = bam_open(s, "w");
+               strcpy(s, g.pre); strcat(s, ".1.bam"); g.out[1] = bam_open(s, "w");
+               strcpy(s, g.pre); strcat(s, ".chimera.bam"); g.out[2] = bam_open(s, "w");
+               for (c = 0; c <= 2; ++c) bam_header_write(g.out[c], h);
+               free(s);
+       }
+
+       iter = bam_plp_init(readaln, &g);
+       g.vpos_shift = 0;
+       seqs = kh_init(64);
+       em = errmod_init(1. - 0.83);
+       bases = calloc(g.max_depth, 2);
+       printf("CC\n");
+       printf("CC\tDescriptions:\nCC\n");
+       printf("CC\t  CC      comments\n");
+       printf("CC\t  PS      start of a phase set\n");
+       printf("CC\t  FL      filtered region\n");
+       printf("CC\t  M[012]  markers; 0 for singletons, 1 for phased and 2 for filtered\n");
+       printf("CC\t  EV      supporting reads; SAM format\n");
+       printf("CC\t  //      end of a phase set\nCC\n");
+       printf("CC\tFormats of PS, FL and M[012] lines (1-based coordinates):\nCC\n");
+       printf("CC\t  PS  chr  phaseSetStart  phaseSetEnd\n");
+       printf("CC\t  FL  chr  filterStart    filterEnd\n");
+       printf("CC\t  M?  chr  PS  pos  allele0  allele1  hetIndex  #supports0  #errors0  #supp1  #err1\n");
+       printf("CC\nCC\n");
+       fflush(stdout);
+       while ((plp = bam_plp_auto(iter, &tid, &pos, &n)) != 0) {
+               int i, k, c, tmp, dophase = 1, in_set = 0;
+               float q[16];
+               if (tid < 0) break;
+               if (tid != lasttid) { // change of chromosome
+                       g.vpos_shift = 0;
+                       if (lasttid >= 0) {
+                               seqs = shrink_hash(seqs);
+                               phase(&g, h->target_name[lasttid], vpos, cns, seqs);
+                               update_vpos(0x7fffffff, seqs);
+                       }
+                       lasttid = tid;
+                       vpos = 0;
+               }
+               if (set && kh_get(set64, set, (uint64_t)tid<<32 | pos) != kh_end(set)) in_set = 1;
+               if (n > g.max_depth) continue; // do not proceed if the depth is too high
+               // fill the bases array and check if there is a variant
+               for (i = k = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = plp + i;
+                       uint8_t *seq;
+                       int q, baseQ, b;
+                       if (p->is_del || p->is_refskip) continue;
+                       baseQ = bam1_qual(p->b)[p->qpos];
+                       if (baseQ < g.min_baseQ) continue;
+                       seq = bam1_seq(p->b);
+                       b = bam_nt16_nt4_table[bam1_seqi(seq, p->qpos)];
+                       if (b > 3) continue;
+                       q = baseQ < p->b->core.qual? baseQ : p->b->core.qual;
+                       if (q < 4) q = 4;
+                       if (q > 63) q = 63;
+                       bases[k++] = q<<5 | (int)bam1_strand(p->b)<<4 | b;
+               }
+               if (k == 0) continue;
+               errmod_cal(em, k, 4, bases, q); // compute genotype likelihood
+               c = gl2cns(q); // get the consensus
+               // tell if to proceed
+               if (set && (g.flag&FLAG_LIST_EXCL) && !in_set) continue; // not in the list
+               if (!in_set && (c&0xffff)>>2 < g.min_varLOD) continue; // not a variant
+               // add the variant
+               if (vpos == max_vpos) {
+                       max_vpos = max_vpos? max_vpos<<1 : 128;
+                       cns = realloc(cns, max_vpos * 8);
+               }
+               cns[vpos] = (uint64_t)pos<<32 | c;
+               for (i = 0; i < n; ++i) {
+                       const bam_pileup1_t *p = plp + i;
+                       uint64_t key;
+                       khint_t k;
+                       uint8_t *seq = bam1_seq(p->b);
+                       frag_t *f;
+                       if (p->is_del || p->is_refskip) continue;
+                       if (p->b->core.qual == 0) continue;
+                       // get the base code
+                       c = nt16_nt4_table[(int)bam1_seqi(seq, p->qpos)];
+                       if (c == (cns[vpos]&3)) c = 1;
+                       else if (c == (cns[vpos]>>16&3)) c = 2;
+                       else c = 0;
+                       // write to seqs
+                       key = X31_hash_string(bam1_qname(p->b));
+                       k = kh_put(64, seqs, key, &tmp);
+                       f = &kh_val(seqs, k);
+                       if (tmp == 0) { // present in the hash table
+                               if (vpos - f->vpos + 1 < MAX_VARS) {
+                                       f->vlen = vpos - f->vpos + 1;
+                                       f->seq[f->vlen-1] = c;
+                                       f->end = bam_calend(&p->b->core, bam1_cigar(p->b));
+                               }
+                               dophase = 0;
+                       } else { // absent
+                               memset(f->seq, 0, MAX_VARS);
+                               f->beg = p->b->core.pos;
+                               f->end = bam_calend(&p->b->core, bam1_cigar(p->b));
+                               f->vpos = vpos, f->vlen = 1, f->seq[0] = c, f->single = f->phased = f->flip = f->ambig = 0;
+                       }
+               }
+               if (dophase) {
+                       seqs = shrink_hash(seqs);
+                       phase(&g, h->target_name[tid], vpos, cns, seqs);
+                       update_vpos(vpos, seqs);
+                       cns[0] = cns[vpos];
+                       vpos = 0;
+               }
+               ++vpos;
+       }
+       if (tid >= 0) phase(&g, h->target_name[tid], vpos, cns, seqs);
+       bam_header_destroy(h);
+       bam_plp_destroy(iter);
+       bam_close(g.fp);
+       kh_destroy(64, seqs);
+       kh_destroy(set64, set);
+       free(cns);
+       errmod_destroy(em);
+       free(bases);
+       if (g.pre) {
+               for (c = 0; c <= 2; ++c) bam_close(g.out[c]);
+               free(g.pre); free(g.b);
+       }
+       return 0;
+}
author	Bo Li <bli@cs.wisc.edu>
	Sun, 4 Dec 2011 05:55:03 +0000 (23:55 -0600)
committer	Bo Li <bli@cs.wisc.edu>
	Sun, 4 Dec 2011 05:55:03 +0000 (23:55 -0600)
EM.cpp		patch \| blob \| history
PairedEndModel.h		patch \| blob \| history
PairedEndQModel.h		patch \| blob \| history
PairedEndRead.h		patch \| blob \| history
PairedEndReadQ.h		patch \| blob \| history
README.md		patch \| blob \| history
ReadReader.h		patch \| blob \| history
RefSeq.h		patch \| blob \| history
Refs.h		patch \| blob \| history
SingleModel.h		patch \| blob \| history
SingleQModel.h		patch \| blob \| history
SingleRead.h		patch \| blob \| history
SingleReadQ.h		patch \| blob \| history
rsem-calculate-expression		patch \| blob \| history
rsem-plot-transcript-wiggles		patch \| blob \| history
sam/bam2depth.c	[new file with mode: 0644]	patch \| blob
sam/bam_cat.c	[new file with mode: 0644]	patch \| blob
sam/bcftools/em.c	[new file with mode: 0644]	patch \| blob
sam/bcftools/kmin.c	[new file with mode: 0644]	patch \| blob
sam/bcftools/kmin.h	[new file with mode: 0644]	patch \| blob
sam/bcftools/mut.c	[new file with mode: 0644]	patch \| blob
sam/bedidx.c	[new file with mode: 0644]	patch \| blob
sam/cut_target.c	[new file with mode: 0644]	patch \| blob
sam/misc/seqtk.c	[new file with mode: 0644]	patch \| blob
sam/phase.c	[new file with mode: 0644]	patch \| blob