lots of changes

author Bo Li <bli@cs.wisc.edu>

Sun, 17 Apr 2011 14:57:59 +0000 (09:57 -0500)

committer Bo Li <bli@cs.wisc.edu>

Sun, 17 Apr 2011 14:57:59 +0000 (09:57 -0500)
author Bo Li <bli@cs.wisc.edu>
Sun, 17 Apr 2011 14:57:59 +0000 (09:57 -0500)
committer Bo Li <bli@cs.wisc.edu>
Sun, 17 Apr 2011 14:57:59 +0000 (09:57 -0500)
diff --git a/EM.cpp b/EM.cpp

index aa7ebdb9011e37b5e58b1b0e00c201b9ca60aed6..ff257680ead4f4f2087515131623a6e61fb498c1 100644 (file)
--- a/EM.cpp
+++ b/EM.cpp
@@ -61,7 +61,8 @@ bool genBamF; // If user wants to generate bam file, true; otherwise, false.
  bool updateModel, calcExpectedWeights;
  bool genGibbsOut; // generate file for Gibbs sampler
  
-char refName[STRLEN], imdName[STRLEN], outName[STRLEN];
+char refName[STRLEN], outName[STRLEN];
+char imdName[STRLEN], statName[STRLEN];
  char refF[STRLEN], groupF[STRLEN], cntF[STRLEN], tiF[STRLEN];
  char mparamsF[STRLEN], bmparamsF[STRLEN];
  char modelF[STRLEN], thetaF[STRLEN];
@@ -305,7 +306,7 @@ void writeResults(ModelType& model, double* counts) {
         char outF[STRLEN];
         FILE *fo;
  
-       sprintf(modelF, "%s.model", outName);
+       sprintf(modelF, "%s.model", statName);
         model.write(modelF);
  
         //calculate tau values
@@ -547,15 +548,9 @@ void EM() {
                         }
                 }
                 fclose(fo);
-
-               char scoreF[STRLEN];
-               sprintf(scoreF, "%s.ns", imdName);
-               fo = fopen(scoreF, "w");
-               fprintf(fo, "%.15g\n", model.getLogP());
-               fclose(fo);
         }
  
-       sprintf(thetaF, "%s.theta", outName);
+       sprintf(thetaF, "%s.theta", statName);
         fo = fopen(thetaF, "w");
         fprintf(fo, "%d\n", M + 1);
  
@@ -635,11 +630,11 @@ int main(int argc, char* argv[]) {
         bool quiet = false;
  
         if (argc < 5) {
-               printf("Usage : rsem-run-em refName read_type imdName outName [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out]\n\n");
+               printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out]\n\n");
                 printf("  refName: reference name\n");
                 printf("  read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n");
-               printf("  imdName: name for all upstream/downstream user-unseen files. (different files have different suffices)\n");
-               printf("  outName: name for all output files. (different files have different suffices)\n");
+               printf("  samplePath: sample path.\n");
+               printf("  sampleName: sample name.\n");
                 printf("  -p: number of threads which user wants to use. (default: 1)\n");
                 printf("  -b: produce bam format output file. (default: off)\n");
                 printf("  -q: set it quiet\n");
@@ -652,8 +647,9 @@ int main(int argc, char* argv[]) {
  
         strcpy(refName, argv[1]);
         read_type = atoi(argv[2]);
-       strcpy(imdName, argv[3]);
-       strcpy(outName, argv[4]);
+       strcpy(outName, argv[3]);
+       sprintf(imdName, "%s.temp/%s", argv[3], argv[4]);
+       sprintf(statName, "%s.stat/%s", argv[3], argv[4]);
  
         nThreads = 1;
  
@@ -691,7 +687,7 @@ int main(int argc, char* argv[]) {
         sprintf(tiF, "%s.ti", refName);
         transcripts.readFrom(tiF);
  
-       sprintf(cntF, "%s.cnt", imdName);
+       sprintf(cntF, "%s.cnt", statName);
         fin.open(cntF);
         if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", cntF); exit(-1); }
         fin>>N0>>N1>>N2>>N_tot;
diff --git a/Gibbs.cpp b/Gibbs.cpp

index 7727f37b73452910e7cb5ad0a5faa37e98a1a5bb..5bdfb247aa2db979cc8c3b345eb50b977462a43e 100644 (file)
--- a/Gibbs.cpp
+++ b/Gibbs.cpp
@@ -7,7 +7,8 @@
  #include<sstream>
  #include<vector>
  
-#include "randomc.h"
+#include "boost/random.hpp"
+
  #include "utils.h"
  
  #include "Model.h"
@@ -35,6 +36,7 @@ int model_type;
  int m, M, N0, N1, nHits;
  double totc;
  int BURNIN, CHAINLEN, GAP;
+char imdName[STRLEN], statName[STRLEN];
  char thetaF[STRLEN], ofgF[STRLEN], groupF[STRLEN], refF[STRLEN], modelF[STRLEN];
  char cvsF[STRLEN];
  
@@ -50,9 +52,11 @@ vector<int> counts;
  bool quiet;
  
  vector<double> arr;
-CRandomMersenne rg(time(NULL));
  
-void load_data(char* reference_name, char* sample_name, char* imdName) {
+boost::mt19937 rng(time(NULL));
+boost::uniform_01<boost::mt19937> rg(rng);
+
+void load_data(char* reference_name, char* statName, char* imdName) {
         ifstream fin;
         string line;
         int tmpVal;
@@ -68,7 +72,7 @@ void load_data(char* reference_name, char* sample_name, char* imdName) {
         m = gi.getm();
  
         //load thetaF
-       sprintf(thetaF, "%s.theta",sample_name);
+       sprintf(thetaF, "%s.theta",statName);
         fin.open(thetaF);
         if (!fin.is_open()) {
                 fprintf(stderr, "Cannot open %s!\n", thetaF);
@@ -123,7 +127,7 @@ void load_data(char* reference_name, char* sample_name, char* imdName) {
  // If by chance arr[len - 1] == 0.0, one possibility is to sample uniformly from 0...len-1
  int sample(vector<double>& arr, int len) {
    int l, r, mid;
-  double prb = rg.Random() * arr[len - 1];
+  double prb = rg() * arr[len - 1];
  
    l = 0; r = len - 1;
    while (l <= r) {
@@ -334,14 +338,16 @@ void writeEstimatedParameters(char* modelF, char* imdName) {
  
  int main(int argc, char* argv[]) {
         if (argc < 7) {
-               printf("Usage: rsem-run-gibbs reference_name sample_name imdName BURNIN CHAINLEN GAP [-q]\n");
+               printf("Usage: rsem-run-gibbs reference_name sample_name sampleToken BURNIN CHAINLEN GAP [-q]\n");
                 exit(-1);
         }
  
         BURNIN = atoi(argv[4]);
         CHAINLEN = atoi(argv[5]);
         GAP = atoi(argv[6]);
-       load_data(argv[1], argv[2], argv[3]);
+       sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
+       sprintf(statName, "%s.stat/%s", argv[2], argv[3]);
+       load_data(argv[1], statName, imdName);
  
         quiet = false;
         if (argc > 7 && !strcmp(argv[7], "-q")) {
@@ -350,19 +356,19 @@ int main(int argc, char* argv[]) {
         verbose = !quiet;
  
         init();
-       Gibbs(argv[3]);
+       Gibbs(imdName);
  
-       sprintf(modelF, "%s.model", argv[2]);
+       sprintf(modelF, "%s.model", statName);
         FILE *fi = fopen(modelF, "r");
         if (fi == NULL) { fprintf(stderr, "Cannot open %s!\n", modelF); exit(-1); }
         fscanf(fi, "%d", &model_type);
         fclose(fi);
  
         switch(model_type) {
-       case 0 : writeEstimatedParameters<SingleModel>(modelF, argv[3]); break;
-       case 1 : writeEstimatedParameters<SingleQModel>(modelF, argv[3]); break;
-       case 2 : writeEstimatedParameters<PairedEndModel>(modelF, argv[3]); break;
-       case 3 : writeEstimatedParameters<PairedEndQModel>(modelF, argv[3]); break;
+       case 0 : writeEstimatedParameters<SingleModel>(modelF, imdName); break;
+       case 1 : writeEstimatedParameters<SingleQModel>(modelF, imdName); break;
+       case 2 : writeEstimatedParameters<PairedEndModel>(modelF, imdName); break;
+       case 3 : writeEstimatedParameters<PairedEndQModel>(modelF, imdName); break;
         }
  
         return 0;
diff --git a/SamParser.h b/SamParser.h

index a36af3a0d11a7133b27c5ee4fe83ae7996b9f631..335984e7394f2645ef4ee178032c57799c69d312 100644 (file)
--- a/SamParser.h
+++ b/SamParser.h
@@ -1,4 +1,4 @@
-/* ReadType here means if the read is unalignable, alignable or aligned too much. It is NOT single read or paired-end read */
+/* ReadType here means if the read is unalignable, alignable or aligned too much. It is NOT siheaderngle read or paired-end read */
  #ifndef SAMPARSER_H_
  #define SAMPARSER_H_
  
@@ -10,7 +10,12 @@
  
  #include "sam/bam.h"
  #include "sam/sam.h"
+
  #include "utils.h"
+
+#include "Transcript.h"
+#include "Transcripts.h"
+
  #include "SingleRead.h"
  #include "SingleReadQ.h"
  #include "PairedEndRead.h"
@@ -20,7 +25,7 @@
  
  class SamParser {
  public:
-       SamParser(char, const char*, const char* = 0);
+       SamParser(char, const char*, Transcripts&, const char* = 0);
         ~SamParser();
  
         /**
@@ -46,6 +51,7 @@ private:
         bam_header_t *header;
         bam1_t *b, *b2;
  
+
         //tag used by aligner
         static char rtTag[STRLEN];
  
@@ -64,12 +70,16 @@ private:
         //0 ~ N0 1 ~ N1 2 ~ N2
         int getReadType(const bam1_t*);
         int getReadType(const bam1_t*, const bam1_t*); // for paired-end reads
+
+       bool check(bam1_t *b) {
+               return (b->core.n_cigar == 1) && ((*bam1_cigar(b) & BAM_CIGAR_MASK) == BAM_CMATCH) && (b->core.l_qseq == (int32_t)(*bam1_cigar(b) >> BAM_CIGAR_SHIFT));
+       }
  };
  
  char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads
  
  // aux, if not 0, points to the file name of fn_list
-SamParser::SamParser(char inpType, const char* inpF, const char* aux) {
+SamParser::SamParser(char inpType, const char* inpF, Transcripts& transcripts, const char* aux) {
         switch(inpType) {
         case 'b': sam_in = samopen(inpF, "rb", aux); break;
         case 's': sam_in = samopen(inpF, "r", aux); break;
@@ -80,6 +90,20 @@ SamParser::SamParser(char inpType, const char* inpF, const char* aux) {
      header = sam_in->header;
      if (header == 0) { fprintf(stderr, "Fail to parse sam header!\n"); exit(-1); }
  
+    // Check if the reference used for aligner is the transcript set RSEM generated
+    if (transcripts.getM() != header->n_targets) {
+       fprintf(stderr, "Number of transcripts does not match! Please align reads against the transcript set and use RSEM generated reference for your aligner!\n");
+       exit(-1);
+    }
+    for (int i = 0; i < header->n_targets; i++) {
+       const Transcript& transcript = transcripts.getTranscriptAt(i + 1);
+       // If update int to long, chance the (int) conversion
+       if (transcript.getTranscriptID().compare(header->target_name[i]) != 0 || transcript.getLength() != (int)header->target_len[i]) {
+               fprintf(stderr, "Transcript information does not match! Please align reads against the transcript set and use RSEM generated reference for your aligner!\n");
+               exit(-1);
+       }
+    }
+
      b = bam_init1();
      b2 = bam_init1();
  }
@@ -110,6 +134,8 @@ int SamParser::parseNext(SingleRead& read, SingleHit& hit) {
         else val = 5;
  
         if (readType == 1) {
+               if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
+
                 if (getDir(b) > 0) {
                         hit = SingleHit(b->core.tid + 1, b->core.pos);
                 }
@@ -139,6 +165,8 @@ int SamParser::parseNext(SingleReadQ& read, SingleHit& hit) {
         else val = 5;
  
         if (readType == 1) {
+               if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
+
                 if (getDir(b) > 0) {
                         hit = SingleHit(b->core.tid + 1, b->core.pos);
                 }
@@ -184,6 +212,8 @@ int SamParser::parseNext(PairedEndRead& read, PairedEndHit& hit) {
         else val = 5;
  
         if (readType == 1) {
+               if (!check(mp1) || !check(mp2)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
+
                 if (mp1->core.tid != mp2->core.tid) {
                         fprintf(stderr, "The two reads do not come from the same pair!");
                         exit(-1);
@@ -233,6 +263,8 @@ int SamParser::parseNext(PairedEndReadQ& read, PairedEndHit& hit) {
         else val = 5;
  
         if (readType == 1) {
+               if (!check(mp1) || !check(mp2)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
+
                 if (mp1->core.tid != mp2->core.tid) {
                         fprintf(stderr, "The two reads do not come from the same pair!");
                         exit(-1);
diff --git a/calcCI.cpp b/calcCI.cpp

index 321cd4ac7e5319ea4b98998c6ab0d24cd1a5d271..c9ccce5909a2be59baabe8b3345232256e8a2da4 100644 (file)
--- a/calcCI.cpp
+++ b/calcCI.cpp
@@ -59,6 +59,7 @@ generator_type **rgs;
  int M, m;
  Refs refs;
  GroupInfo gi;
+char imdName[STRLEN], statName[STRLEN];
  char modelF[STRLEN], groupF[STRLEN], refF[STRLEN];
  
  vector<double> eel; //expected effective lengths
@@ -327,17 +328,17 @@ void generateResults(char* imdName) {
  
         if (verbose) { printf("All credibility intervals are calculated!\n"); }
  
-        sprintf(outF, "%s.tau_denoms", imdName);
-        fo = fopen(outF, "w");
-        fprintf(fo, "%d\n", nSamples);
-        for (int i = 0; i < nSamples; i++) fprintf(fo, "%.15g ", tau_denoms[i]);
-        fprintf(fo, "\n");
-        fclose(fo);
+       sprintf(outF, "%s.tau_denoms", imdName);
+       fo = fopen(outF, "w");
+       fprintf(fo, "%d\n", nSamples);
+       for (int i = 0; i < nSamples; i++) fprintf(fo, "%.15g ", tau_denoms[i]);
+       fprintf(fo, "\n");
+       fclose(fo);
  }
  
  int main(int argc, char* argv[]) {
         if (argc < 7) {
-               printf("Usage: rsem-calculate-credibility-intervals reference_name sample_name imdName confidence nSpC nMB[-q]\n");
+               printf("Usage: rsem-calculate-credibility-intervals reference_name sample_name sampleToken confidence nSpC nMB[-q]\n");
                 exit(-1);
         }
  
@@ -351,7 +352,10 @@ int main(int argc, char* argv[]) {
         }
         verbose = !quiet;
  
-       sprintf(modelF, "%s.model", argv[2]);
+       sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
+       sprintf(statName, "%s.stat/%s", argv[2], argv[3]);
+
+       sprintf(modelF, "%s.model", statName);
         FILE *fi = fopen(modelF, "r");
         if (fi == NULL) { fprintf(stderr, "Cannot open %s!\n", modelF); exit(-1); }
         fscanf(fi, "%d", &model_type);
@@ -364,8 +368,8 @@ int main(int argc, char* argv[]) {
         gi.load(groupF);
         m = gi.getm();
  
-       sprintf(tmpF, "%s.tmp", argv[3]);
-       sprintf(cvsF, "%s.countvectors", argv[3]);
+       sprintf(tmpF, "%s.tmp", imdName);
+       sprintf(cvsF, "%s.countvectors", imdName);
  
         switch(model_type) {
         case 0 : sampling<SingleModel>(); break;
@@ -374,15 +378,15 @@ int main(int argc, char* argv[]) {
         case 3 : sampling<PairedEndQModel>(); break;
         }
  
-       generateResults(argv[3]);
+       generateResults(imdName);
  
         delete[] tau_denoms;
  
-               sprintf(command, "rm -f %s", tmpF);
+       sprintf(command, "rm -f %s", tmpF);
         int status = system(command);
         if (status != 0) {
-         fprintf(stderr, "Cannot delete %s!\n", tmpF);
-         exit(-1);
+               fprintf(stderr, "Cannot delete %s!\n", tmpF);
+               exit(-1);
         }
  
         return 0;
diff --git a/makefile b/makefile

index e789a1ee5e117bd1e4e50045b4b6f467679bb567..df8d2a1c1605592b1e3cf4b346458b9b3529c22b 100644 (file)
--- a/makefile
+++ b/makefile
@@ -48,13 +48,13 @@ PairedEndHit.h : SingleHit.h
  HitContainer.h : GroupInfo.h
  
  
-SamParser.h : sam/sam.h sam/bam.h utils.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h
+SamParser.h : sam/sam.h sam/bam.h utils.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Transcript.h Transcripts.h
  
  
  rsem-parse-alignments : parseIt.o sam/libbam.a
         $(CC) -o rsem-parse-alignments parseIt.o sam/libbam.a -lz 
  
-parseIt.o : utils.h GroupInfo.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h parseIt.cpp
+parseIt.o : utils.h GroupInfo.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h Transcript.h Transcripts.h sam/sam.h sam/bam.h parseIt.cpp
         $(CC) $(COFLAGS) parseIt.cpp
  
  
@@ -85,22 +85,17 @@ EM.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.
  rsem-bam2wig : sam/bam.h sam/sam.h sam/libbam.a bam2wig.cpp
         $(CC) -O3 -Wall bam2wig.cpp sam/libbam.a -lz -o rsem-bam2wig
  
-mersenne.o : mersenne.cpp randomc.h
-       $(CC) $(COFLAGS) mersenne.cpp
+rsem-simulate-reads : simulation.o
+       $(CC) -o rsem-simulate-reads simulation.o
  
-simul_mersenne.h : simul.h randomc.h
-
-rsem-simulate-reads : simulation.o mersenne.o
-       $(CC) -o rsem-simulate-reads simulation.o mersenne.o
-
-simulation.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h RefSeq.h GroupInfo.h Transcript.h Transcripts.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h simul.h simul_mersenne.h randomc.h simulation.cpp
+simulation.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h RefSeq.h GroupInfo.h Transcript.h Transcripts.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h simul.h boost/random.hpp simulation.cpp
         $(CC) $(COFLAGS) simulation.cpp
  
-rsem-run-gibbs : Gibbs.o mersenne.o
-       $(CC) -o rsem-run-gibbs Gibbs.o mersenne.o 
+rsem-run-gibbs : Gibbs.o
+       $(CC) -o rsem-run-gibbs Gibbs.o 
  
  #some header files are omitted
-Gibbs.o : randomc.h utils.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h Gibbs.cpp
+Gibbs.o : utils.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h boost/random.hpp Gibbs.cpp 
         $(CC) $(COFLAGS) Gibbs.cpp
  
  rsem-calculate-credibility-intervals : calcCI.o
diff --git a/mersenne.cpp b/mersenne.cpp

deleted file mode 100644 (file)

index d9be809..0000000
--- a/mersenne.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/************************** MERSENNE.CPP ******************** AgF 2007-08-01 *\r
-*  Random Number generator 'Mersenne Twister'                                *\r
-*                                                                            *\r
-*  This random number generator is described in the article by               *\r
-*  M. Matsumoto & T. Nishimura, in:                                          *\r
-*  ACM Transactions on Modeling and Computer Simulation,                     *\r
-*  vol. 8, no. 1, 1998, pp. 3-30.                                            *\r
-*  Details on the initialization scheme can be found at                      *\r
-*  http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html                  *\r
-*                                                                            *\r
-*  Experts consider this an excellent random number generator.               *\r
-*                                                                            *\r
-*  © 2001 - 2007 A. Fog. Published under the GNU General Public License      *\r
-*  (www.gnu.org/copyleft/gpl.html) with the further restriction that it      *\r
-*  cannot be used for gambling applications.                                 *\r
-*****************************************************************************/\r
-\r
-#include "randomc.h"\r
-\r
-void CRandomMersenne::Init0(uint32 seed) {\r
-   // Detect computer architecture\r
-   union {double f; uint32 i[2];} convert;\r
-   convert.f = 1.0;\r
-   if (convert.i[1] == 0x3FF00000) Architecture = LITTLE_ENDIAN1;\r
-   else if (convert.i[0] == 0x3FF00000) Architecture = BIG_ENDIAN1;\r
-   else Architecture = NONIEEE;\r
-\r
-   // Seed generator\r
-   mt[0]= seed;\r
-   for (mti=1; mti < MERS_N; mti++) {\r
-      mt[mti] = (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);\r
-   }\r
-}\r
-\r
-void CRandomMersenne::RandomInit(uint32 seed) {\r
-   // Initialize and seed\r
-   Init0(seed);\r
-\r
-   // Randomize some more\r
-   for (int i = 0; i < 37; i++) BRandom();\r
-}\r
-\r
-\r
-void CRandomMersenne::RandomInitByArray(uint32 seeds[], int length) {\r
-   // Seed by more than 32 bits\r
-   int i, j, k;\r
-\r
-   // Initialize\r
-   Init0(19650218);\r
-\r
-   if (length <= 0) return;\r
-\r
-   // Randomize mt[] using whole seeds[] array\r
-   i = 1;  j = 0;\r
-   k = (MERS_N > length ? MERS_N : length);\r
-   for (; k; k--) {\r
-      mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL)) + seeds[j] + j;\r
-      i++; j++;\r
-      if (i >= MERS_N) {mt[0] = mt[MERS_N-1]; i=1;}\r
-      if (j >= length) j=0;}\r
-   for (k = MERS_N-1; k; k--) {\r
-      mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL)) - i;\r
-      if (++i >= MERS_N) {mt[0] = mt[MERS_N-1]; i=1;}}\r
-   mt[0] = 0x80000000UL;  // MSB is 1; assuring non-zero initial array\r
-\r
-   // Randomize some more\r
-   mti = 0;\r
-   for (int i = 0; i <= MERS_N; i++) BRandom();\r
-}\r
-\r
-\r
-uint32 CRandomMersenne::BRandom() {\r
-   // Generate 32 random bits\r
-   uint32 y;\r
-\r
-   if (mti >= MERS_N) {\r
-      // Generate MERS_N words at one time\r
-      const uint32 LOWER_MASK = (1LU << MERS_R) - 1;       // Lower MERS_R bits\r
-      const uint32 UPPER_MASK = 0xFFFFFFFF << MERS_R;      // Upper (32 - MERS_R) bits\r
-      static const uint32 mag01[2] = {0, MERS_A};\r
-\r
-      int kk;\r
-      for (kk=0; kk < MERS_N-MERS_M; kk++) {    \r
-         y = (mt[kk] & UPPER_MASK) | (mt[kk+1] & LOWER_MASK);\r
-         mt[kk] = mt[kk+MERS_M] ^ (y >> 1) ^ mag01[y & 1];}\r
-\r
-      for (; kk < MERS_N-1; kk++) {    \r
-         y = (mt[kk] & UPPER_MASK) | (mt[kk+1] & LOWER_MASK);\r
-         mt[kk] = mt[kk+(MERS_M-MERS_N)] ^ (y >> 1) ^ mag01[y & 1];}      \r
-\r
-      y = (mt[MERS_N-1] & UPPER_MASK) | (mt[0] & LOWER_MASK);\r
-      mt[MERS_N-1] = mt[MERS_M-1] ^ (y >> 1) ^ mag01[y & 1];\r
-      mti = 0;\r
-   }\r
-\r
-   y = mt[mti++];\r
-\r
-#if 1\r
-   // Tempering (May be omitted):\r
-   y ^=  y >> MERS_U;\r
-   y ^= (y << MERS_S) & MERS_B;\r
-   y ^= (y << MERS_T) & MERS_C;\r
-   y ^=  y >> MERS_L;\r
-#endif\r
-\r
-   return y;\r
-}\r
-\r
-\r
-double CRandomMersenne::Random() {\r
-   // Output random float number in the interval 0 <= x < 1\r
-   union {double f; uint32 i[2];} convert;\r
-   uint32 r = BRandom();               // Get 32 random bits\r
-   // The fastest way to convert random bits to floating point is as follows:\r
-   // Set the binary exponent of a floating point number to 1+bias and set\r
-   // the mantissa to random bits. This will give a random number in the \r
-   // interval [1,2). Then subtract 1.0 to get a random number in the interval\r
-   // [0,1). This procedure requires that we know how floating point numbers\r
-   // are stored. The storing method is tested in function RandomInit and saved \r
-   // in the variable Architecture.\r
-\r
-   // This shortcut allows the compiler to optimize away the following switch\r
-   // statement for the most common architectures:\r
-#if defined(_M_IX86) || defined(_M_X64) || defined(__LITTLE_ENDIAN__)\r
-   Architecture = LITTLE_ENDIAN1;\r
-#elif defined(__BIG_ENDIAN__)\r
-   Architecture = BIG_ENDIAN1;\r
-#endif\r
-\r
-   switch (Architecture) {\r
-   case LITTLE_ENDIAN1:\r
-      convert.i[0] =  r << 20;\r
-      convert.i[1] = (r >> 12) | 0x3FF00000;\r
-      return convert.f - 1.0;\r
-   case BIG_ENDIAN1:\r
-      convert.i[1] =  r << 20;\r
-      convert.i[0] = (r >> 12) | 0x3FF00000;\r
-      return convert.f - 1.0;\r
-   case NONIEEE: default: ;\r
-   } \r
-   // This somewhat slower method works for all architectures, including \r
-   // non-IEEE floating point representation:\r
-   return (double)r * (1./((double)(uint32)(-1L)+1.));\r
-}\r
-\r
-\r
-int CRandomMersenne::IRandom(int min, int max) {\r
-   // Output random integer in the interval min <= x <= max\r
-   // Relative error on frequencies < 2^-32\r
-   if (max <= min) {\r
-      if (max == min) return min; else return 0x80000000;\r
-   }\r
-   // Multiply interval with random and truncate\r
-   int r = int((max - min + 1) * Random()) + min; \r
-   if (r > max) r = max;\r
-   return r;\r
-}\r
-\r
-\r
-int CRandomMersenne::IRandomX(int min, int max) {\r
-   // Output random integer in the interval min <= x <= max\r
-   // Each output value has exactly the same probability.\r
-   // This is obtained by rejecting certain bit values so that the number\r
-   // of possible bit values is divisible by the interval length\r
-   if (max <= min) {\r
-      if (max == min) return min; else return 0x80000000;\r
-   }\r
-#ifdef  INT64_DEFINED\r
-   // 64 bit integers available. Use multiply and shift method\r
-   uint32 interval;                    // Length of interval\r
-   uint64 longran;                     // Random bits * interval\r
-   uint32 iran;                        // Longran / 2^32\r
-   uint32 remainder;                   // Longran % 2^32\r
-\r
-   interval = uint32(max - min + 1);\r
-   if (interval != LastInterval) {\r
-      // Interval length has changed. Must calculate rejection limit\r
-      // Reject when remainder = 2^32 / interval * interval\r
-      // RLimit will be 0 if interval is a power of 2. No rejection then\r
-      RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;\r
-      LastInterval = interval;\r
-   }\r
-   do { // Rejection loop\r
-      longran  = (uint64)BRandom() * interval;\r
-      iran = (uint32)(longran >> 32);\r
-      remainder = (uint32)longran;\r
-   } while (remainder > RLimit);\r
-   // Convert back to signed and return result\r
-   return (int32)iran + min;\r
-\r
-#else\r
-   // 64 bit integers not available. Use modulo method\r
-   uint32 interval;                    // Length of interval\r
-   uint32 bran;                        // Random bits\r
-   uint32 iran;                        // bran / interval\r
-   uint32 remainder;                   // bran % interval\r
-\r
-   interval = uint32(max - min + 1);\r
-   if (interval != LastInterval) {\r
-      // Interval length has changed. Must calculate rejection limit\r
-      // Reject when iran = 2^32 / interval\r
-      // We can't make 2^32 so we use 2^32-1 and correct afterwards\r
-      RLimit = (uint32)0xFFFFFFFF / interval;\r
-      if ((uint32)0xFFFFFFFF % interval == interval - 1) RLimit++;\r
-   }\r
-   do { // Rejection loop\r
-      bran = BRandom();\r
-      iran = bran / interval;\r
-      remainder = bran % interval;\r
-   } while (iran >= RLimit);\r
-   // Convert back to signed and return result\r
-   return (int32)remainder + min;\r
-\r
-#endif\r
-}\r
diff --git a/parseIt.cpp b/parseIt.cpp

index c266715b31693686f7db773155d01e7905321102..55dd954d3d1e2059911987ed5a8521748b16e9a1 100644 (file)
--- a/parseIt.cpp
+++ b/parseIt.cpp
@@ -8,11 +8,15 @@
  #include<iostream>
  #include<fstream>
  #include<string>
+#include<map>
  
  #include "utils.h"
  
  #include "GroupInfo.h"
  
+#include "Transcript.h"
+#include "Transcripts.h"
+
  #include "SingleRead.h"
  #include "SingleReadQ.h"
  #include "PairedEndRead.h"
@@ -30,10 +34,12 @@ int N[3]; // note, N = N0 + N1 + N2 , but may not be equal to the total number o
  int nHits; // # of hits
  int nUnique, nMulti, nIsoMulti;
  char fn_list[STRLEN];
-char groupF[STRLEN];
+char groupF[STRLEN], tiF[STRLEN];
+char imdName[STRLEN];
  char datF[STRLEN], cntF[STRLEN];
  
  GroupInfo gi;
+Transcripts transcripts;
  
  SamParser *parser;
  ofstream hit_out;
@@ -42,14 +48,14 @@ int n_os; // number of ostreams
  ostream *cat[3][2]; // cat : category  1-dim 0 N0 1 N1 2 N2; 2-dim  0 mate1 1 mate2
  char readOutFs[3][2][STRLEN];
  
-void init(const char* imdName, char alignFType, const char* alignF) {
+map<int, int> counter;
+map<int, int>::iterator iter;
  
-       sprintf(datF, "%s.dat", imdName);
-       sprintf(cntF, "%s.cnt", imdName);
+void init(const char* imdName, char alignFType, const char* alignF) {
  
         char* aux = 0;
         if (strcmp(fn_list, "")) aux = fn_list;
-       parser = new SamParser(alignFType, alignF, aux);
+       parser = new SamParser(alignFType, alignF, transcripts, aux);
  
         memset(cat, 0, sizeof(cat));
         memset(readOutFs, 0, sizeof(readOutFs));
@@ -64,6 +70,8 @@ void init(const char* imdName, char alignFType, const char* alignF) {
                 for (int j = 0; j < n_os; j++)
                         cat[i][j] = new ofstream(readOutFs[i][j]);
         }
+
+       counter.clear();
  }
  
  //Do not allow duplicate for unalignable reads and supressed reads in SAM input
@@ -93,6 +101,14 @@ void parseIt(SamParser *parser) {
                                 nMulti += hits.calcNumGeneMultiReads(gi);
                                 nIsoMulti += hits.calcNumIsoformMultiReads();
                                 hits.write(hit_out);
+
+                               iter = counter.find(hits.getNHits());
+                               if (iter != counter.end()) {
+                                       iter->second++;
+                               }
+                               else {
+                                       counter[hits.getNHits()] = 1;
+                               }
                         }
  
                         hits.clear();
@@ -116,6 +132,14 @@ void parseIt(SamParser *parser) {
                 nMulti += hits.calcNumGeneMultiReads(gi);
                 nIsoMulti += hits.calcNumIsoformMultiReads();
                 hits.write(hit_out);
+
+               iter = counter.find(hits.getNHits());
+               if (iter != counter.end()) {
+                       iter->second++;
+               }
+               else {
+                       counter[hits.getNHits()] = 1;
+               }
         }
  
         nUnique = N[1] - nMulti;
@@ -138,15 +162,15 @@ void release() {
  int main(int argc, char* argv[]) {
         bool quiet = false;
  
-       if (argc < 5) {
-               printf("Usage : rsem-parse-alignments refName imdName alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
+       if (argc < 6) {
+               printf("Usage : rsem-parse-alignments refName sampleName sampleToken alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
                 exit(-1);
         }
  
         strcpy(fn_list, "");
         read_type = 0;
-       if (argc > 5) {
-               for (int i = 5; i < argc; i++) {
+       if (argc > 6) {
+               for (int i = 6; i < argc; i++) {
                         if (!strcmp(argv[i], "-t")) {
                                 read_type = atoi(argv[i + 1]);
                         }
@@ -162,10 +186,16 @@ int main(int argc, char* argv[]) {
  
         verbose = !quiet;
  
-       init(argv[2], argv[3][0], argv[4]);
-
         sprintf(groupF, "%s.grp", argv[1]);
         gi.load(groupF);
+       sprintf(tiF, "%s.ti", argv[1]);
+       transcripts.readFrom(tiF);
+
+       sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
+       sprintf(datF, "%s.dat", imdName);
+       sprintf(cntF, "%s.stat/%s.cnt", argv[2], argv[3]);
+
+       init(imdName, argv[4][0], argv[5]);
  
         hit_out.open(datF);
  
@@ -190,6 +220,9 @@ int main(int argc, char* argv[]) {
         fout<<N[0]<<" "<<N[1]<<" "<<N[2]<<" "<<(N[0] + N[1] + N[2])<<endl;
         fout<<nUnique<<" "<<nMulti<<" "<<nIsoMulti<<endl;
         fout<<nHits<<" "<<read_type<<endl;
+       for (iter = counter.begin(); iter != counter.end(); iter++) {
+               fout<<iter->first<<'\t'<<iter->second<<endl;
+       }
         fout.close();
  
         release();
diff --git a/randomc.h b/randomc.h

deleted file mode 100644 (file)

index d5c8e9f..0000000
--- a/randomc.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/*************************** RANDOMC.H ***************** 2007-09-22 Agner Fog *\r
-*\r
-* This file contains class declarations and other definitions for the C++ \r
-* library of uniform random number generators.\r
-*\r
-* Overview of classes:\r
-* ====================\r
-*\r
-* class CRandomMersenne:\r
-* Random number generator of type Mersenne twister.\r
-* Source file mersenne.cpp\r
-*\r
-* class CRandomMother:\r
-* Random number generator of type Mother-of-All (Multiply with carry).\r
-* Source file mother.cpp\r
-*\r
-*\r
-* Member functions (methods):\r
-* ===========================\r
-*\r
-* All these classes have identical member functions:\r
-*\r
-* Constructor(uint32 seed):\r
-* The seed can be any integer. Usually the time is used as seed.\r
-* Executing a program twice with the same seed will give the same sequence of\r
-* random numbers. A different seed will give a different sequence.\r
-*\r
-* void RandomInit(uint32 seed);\r
-* Re-initializes the random number generator with a new seed.\r
-*\r
-* void RandomInitByArray(uint32 seeds[], int length);\r
-* In CRandomMersenne only: Use this function if you want to initialize with\r
-* a seed with more than 32 bits. All bits in the seeds[] array will influence\r
-* the sequence of random numbers generated. length is the number of entries\r
-* in the seeds[] array.\r
-*\r
-* double Random();\r
-* Gives a floating point random number in the interval 0 <= x < 1.\r
-* The resolution is 32 bits in CRandomMother and CRandomMersenne.\r
-*\r
-* int IRandom(int min, int max);\r
-* Gives an integer random number in the interval min <= x <= max.\r
-* (max-min < MAXINT).\r
-* The precision is 2^-32 (defined as the difference in frequency between \r
-* possible output values). The frequencies are exact if max-min+1 is a\r
-* power of 2.\r
-*\r
-* int IRandomX(int min, int max);\r
-* Same as IRandom, but exact. In CRandomMersenne only.\r
-* The frequencies of all output values are exactly the same for an \r
-* infinitely long sequence. (Only relevant for extremely long sequences).\r
-*\r
-* uint32 BRandom();\r
-* Gives 32 random bits. \r
-*\r
-*\r
-* Example:\r
-* ========\r
-* The file EX-RAN.CPP contains an example of how to generate random numbers.\r
-*\r
-*\r
-* Optimized version:\r
-* ==================\r
-* Faster versions of these random number generators are provided as function\r
-* libraries in asmlib.zip. These function libraries are coded in assembly\r
-* language and support only x86 platforms, including 32-bit and 64-bit\r
-* Windows, Linux, BSD, Mac OS-X (Intel based). Use asmlibran.h from asmlib.zip\r
-*\r
-*\r
-* Non-uniform random number generators:\r
-* =====================================\r
-* Random number generators with various non-uniform distributions are available\r
-* in stocc.zip (www.agner.org/random).\r
-*\r
-*\r
-* Further documentation:\r
-* ======================\r
-* The file randomc.htm contains further documentation on these random number\r
-* generators.\r
-*\r
-*\r
-* Copyright:\r
-============\r
-* © 1997 - 2007 Agner Fog. All software in this library is published under the\r
-* GNU General Public License with the further restriction that it cannot be \r
-* used for gambling applications. See licence.htm\r
-*******************************************************************************/\r
-\r
-#ifndef RANDOMC_H\r
-#define RANDOMC_H\r
-\r
-\r
-// Define 32 bit signed and unsigned integers.\r
-// Change these definitions, if necessary, to match a particular platform\r
-#if defined(_WIN16) || defined(__MSDOS__) || defined(_MSDOS) \r
-   // 16 bit systems use long int for 32 bit integer\r
-   typedef long int           int32;   // 32 bit signed integer\r
-   typedef unsigned long int  uint32;  // 32 bit unsigned integer\r
-#else\r
-   // Most other systems use int for 32 bit integer\r
-   typedef int                int32;   // 32 bit signed integer\r
-   typedef unsigned int       uint32;  // 32 bit unsigned integer\r
-#endif\r
-\r
-// Define 64 bit signed and unsigned integers, if possible\r
-#if (defined(__WINDOWS__) || defined(_WIN32)) && (defined(_MSC_VER) || defined(__INTEL_COMPILER))\r
-   // Microsoft and other compilers under Windows use __int64\r
-   typedef __int64            int64;   // 64 bit signed integer\r
-   typedef unsigned __int64   uint64;  // 64 bit unsigned integer\r
-   #define INT64_DEFINED               // Remember that int64 is defined\r
-#elif defined(__unix__) && (defined(_M_IX86) || defined(_M_X64))\r
-   // Gnu and other compilers under Linux etc. use long long\r
-   typedef long long          int64;   // 64 bit signed integer\r
-   typedef unsigned long long uint64;  // 64 bit unsigned integer\r
-   #define INT64_DEFINED               // Remember that int64 is defined\r
-#else\r
-   // 64 bit integers not defined\r
-   // You may include definitions for other platforms here\r
-#endif\r
-\r
-\r
-/***********************************************************************\r
-System-specific user interface functions\r
-***********************************************************************/\r
-\r
-void EndOfProgram(void);               // System-specific exit code (userintf.cpp)\r
-\r
-void FatalError(char * ErrorText);     // System-specific error reporting (userintf.cpp)\r
-\r
-\r
-/***********************************************************************\r
-Define random number generator classes\r
-***********************************************************************/\r
-\r
-class CRandomMersenne {                // Encapsulate random number generator\r
-#if 0\r
-   // Define constants for type MT11213A:\r
-#define MERS_N   351\r
-#define MERS_M   175\r
-#define MERS_R   19\r
-#define MERS_U   11\r
-#define MERS_S   7\r
-#define MERS_T   15\r
-#define MERS_L   17\r
-#define MERS_A   0xE4BD75F5\r
-#define MERS_B   0x655E5280\r
-#define MERS_C   0xFFD58000\r
-#else    \r
-   // or constants for type MT19937:\r
-#define MERS_N   624\r
-#define MERS_M   397\r
-#define MERS_R   31\r
-#define MERS_U   11\r
-#define MERS_S   7\r
-#define MERS_T   15\r
-#define MERS_L   18\r
-#define MERS_A   0x9908B0DF\r
-#define MERS_B   0x9D2C5680\r
-#define MERS_C   0xEFC60000\r
-#endif\r
-public:\r
-   CRandomMersenne(uint32 seed) {      // Constructor\r
-      RandomInit(seed); LastInterval = 0;}\r
-   void RandomInit(uint32 seed);       // Re-seed\r
-   void RandomInitByArray(uint32 seeds[], int length); // Seed by more than 32 bits\r
-   int IRandom (int min, int max);     // Output random integer\r
-   int IRandomX(int min, int max);     // Output random integer, exact\r
-   double Random();                    // Output random float\r
-   uint32 BRandom();                   // Output random bits\r
-private:\r
-   void Init0(uint32 seed);            // Basic initialization procedure\r
-   uint32 mt[MERS_N];                  // State vector\r
-   int mti;                            // Index into mt\r
-   uint32 LastInterval;                // Last interval length for IRandomX\r
-   uint32 RLimit;                      // Rejection limit used by IRandomX\r
-   enum TArch {LITTLE_ENDIAN1, BIG_ENDIAN1, NONIEEE}; // Definition of architecture\r
-   TArch Architecture;                 // Conversion to float depends on architecture\r
-};    \r
-\r
-\r
-class CRandomMother {             // Encapsulate random number generator\r
-public:\r
-   void RandomInit(uint32 seed);       // Initialization\r
-   int IRandom(int min, int max);      // Get integer random number in desired interval\r
-   double Random();                    // Get floating point random number\r
-   uint32 BRandom();                   // Output random bits\r
-   CRandomMother(uint32 seed) {   // Constructor\r
-      RandomInit(seed);}\r
-protected:\r
-   uint32 x[5];                        // History buffer\r
-};\r
-\r
-#endif\r
diff --git a/rsem-calculate-expression b/rsem-calculate-expression

index 7826ec72680bc551f7c4a167d53e2d9d0b830905..e841e4b91e77b311d0ca33dc54b1bde4d468b14e 100755 (executable)
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -113,7 +113,7 @@ my $mate1_list = "";
  my $mate2_list = "";
  my $inpF = "";
  
-my ($refName, $taskName, $tmp_dir, $imdName) = ();
+my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName) = ();
  my $gap = 32;
  
  if ($paired_end) {
@@ -129,30 +129,31 @@ if (scalar(@ARGV) == 3) {
      if ($is_sam || $is_bam) { $inpF = $ARGV[0]; } 
      else {$mate1_list = $ARGV[0]; }
      $refName = $ARGV[1];
-    $taskName = $ARGV[2];
+    $sampleName = $ARGV[2];
  }
  else {
      $mate1_list = $ARGV[0];
      $mate2_list = $ARGV[1];
      $refName = $ARGV[2];
-    $taskName = $ARGV[3];
+    $sampleName = $ARGV[3];
  }
  
-$tmp_dir = $taskName.".temp";
-my $pos = rindex($taskName, '/');
-if ($pos < 0) {
-    $imdName = "$tmp_dir/$taskName"; 
-}
-else {
-    $imdName = $tmp_dir."/".substr($taskName, $pos + 1);
-}
+my $pos = rindex($sampleName, '/');
+if ($pos < 0) { $sampleToken = $sampleName; }
+else { $sampleToken = substr($sampleName, $pos + 1); }
+
+$temp_dir = "$sampleName.temp";
+$stat_dir = "$sampleName.stat";
+
+if (!(-d $temp_dir) && !mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); }
+if (!(-d $stat_dir) && !mkdir($stat_dir)) { print "Fail to create folder $stat_dir.\n"; exit(-1); }
+
+$imdName = "$temp_dir/$sampleToken";
  
  if (!$is_sam && !$is_bam && $phred33 + $phred64 + $solexa == 0) { $phred33 = 1; }
  
  my ($mate_minL, $mate_maxL) = (1, $maxL);
  
-if (!(-d $tmp_dir) && !mkdir($tmp_dir)) { print "Fail to create the directory.\n"; exit(-1); }
-
  if ($bowtie_path ne "") { $bowtie_path .= "/"; }
  
  my ($fn, $dir, $suf) = fileparse($0);
@@ -199,7 +200,7 @@ if (!$is_sam && !$is_bam) {
      $is_sam = 1; # output of bowtie is a sam file
  }
  
-$command = $dir."rsem-parse-alignments $refName $imdName";
+$command = $dir."rsem-parse-alignments $refName $sampleName $sampleToken";
  
  my $samInpType;
  if ($is_sam) { $samInpType = "s"; } 
@@ -244,7 +245,7 @@ print OUTPUT "$mean $sd\n";
  print OUTPUT "$L\n";
  close(OUTPUT);  
  
-$command = $dir."rsem-run-em $refName $read_type $imdName $taskName -p $nThreads";
+$command = $dir."rsem-run-em $refName $read_type $sampleName $sampleToken -p $nThreads";
  if ($genBamF) { 
      $command .= " -b $samInpType $inpF";
      if ($fn_list ne "") { $command .= " 1 $fn_list"; }
@@ -262,7 +263,7 @@ if ($status != 0) {
  print "\n";
  
  if ($genBamF) {
-    $command = $dir."sam/samtools sort $taskName.bam $taskName.sorted";
+    $command = $dir."sam/samtools sort $sampleName.bam $sampleName.sorted";
      print "$command\n";
      $status = system($command);
      if ($status != 0) {
@@ -270,7 +271,7 @@ if ($genBamF) {
         exit(-1);
      }
      print "\n";
-    $command = $dir."sam/samtools index $taskName.sorted.bam";
+    $command = $dir."sam/samtools index $sampleName.sorted.bam";
      print "$command\n";
      $status = system($command);
      if ($status != 0) {
@@ -280,11 +281,11 @@ if ($genBamF) {
      print "\n";
  }
  
-&collectResults("$imdName.iso_res", "$taskName.isoforms.results"); # isoform level
-&collectResults("$imdName.gene_res", "$taskName.genes.results"); # gene level
+&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
  
  if ($calcCI) {
-    $command = $dir."rsem-run-gibbs $refName $taskName $imdName $BURNIN $CHAINLEN $SAMPLEGAP";
+    $command = $dir."rsem-run-gibbs $refName $sampleName $sampleToken $BURNIN $CHAINLEN $SAMPLEGAP";
      if ($quiet) { $command .= " -q"; }
      print "$command\n";
      $status = system($command);
@@ -294,12 +295,12 @@ if ($calcCI) {
      }
      print "\n";
  
-    system("mv $taskName.isoforms.results $imdName.isoforms.results.bak1");
-    system("mv $taskName.genes.results $imdName.genes.results.bak1");
-    &collectResults("$imdName.iso_res", "$taskName.isoforms.results"); # isoform level
-    &collectResults("$imdName.gene_res", "$taskName.genes.results"); # gene level
+    system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1");
+    system("mv $sampleName.genes.results $imdName.genes.results.bak1");
+    &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+    &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
  
-    $command = $dir."rsem-calculate-credibility-intervals $refName $taskName $imdName $CONFIDENCE $NSPC $NMB";
+    $command = $dir."rsem-calculate-credibility-intervals $refName $sampleName $sampleToken $CONFIDENCE $NSPC $NMB";
      if ($quiet) { $command .= " -q"; }
      print "$command\n";
      $status = system($command);
@@ -309,10 +310,10 @@ if ($calcCI) {
      }
      print "\n";
  
-    system("mv $taskName.isoforms.results $imdName.isoforms.results.bak2");
-    system("mv $taskName.genes.results $imdName.genes.results.bak2");
-    &collectResults("$imdName.iso_res", "$taskName.isoforms.results"); # isoform level
-    &collectResults("$imdName.gene_res", "$taskName.genes.results"); # gene level
+    system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2");
+    system("mv $sampleName.genes.results $imdName.genes.results.bak2");
+    &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+    &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
  }
  
  if (!$keep_intermediate_files) {
@@ -582,11 +583,6 @@ file. If no other attributes are given or no GTF file is provided in
  'rsem-prepare-reference', there will be no tab after the
  tau_value field.
  
-=item B<sample_name.model> and B<sample_name.theta>
-
-Output files used by RSEM internally for tasks like simulation,
-compute credibility intervals etc.
-
  =item B<sample_name.bam, sample_name.sorted.bam and sample_name.sorted.bam.bai>
  
  Only generated when --out-bam is specified.
@@ -604,6 +600,9 @@ representing the posterior probability.
  'sample_name.sorted.bam' and 'sample_name.sorted.bam.bai' are the
  sorted BAM file and indices generated by samtools (included in RSEM package).
  
+=item B<sample_name.stat>
+
+This is a folder instead of a file. All model related statistics are stored in this folder. Use 'rsem-plot-model' can generate plots using this folder.
  
  =back
  
diff --git a/rsem-plot-model b/rsem-plot-model

index e9225eddf41466424d265cc9b55ef7ebf96b7aa4..f7b84fc0e3664d5441807281afc2ad276aef4b8b 100755 (executable)
--- a/rsem-plot-model
+++ b/rsem-plot-model
@@ -2,13 +2,20 @@
  
  argv <- commandArgs(TRUE)
  if (length(argv) != 2) {
-  cat("Usage: rsem-plot-model modelF outF\n")
+  cat("Usage: rsem-plot-model sample_name outF\n")
    q(status = 1)
  }
  
-con <- file(argv[1], open = "r")       
+strvec <- strsplit(argv[1], split = "/")[[1]]
+token <- strvec[length(strvec)]
+
+modelF <- paste(argv[1], ".stat/", token, ".model")
+cntF <- paste(argv[1], ".stat/", token, ".cnt")
+
  pdf(argv[2])
  
+con <- file(modelF, open = "r")        
+
  # model type and forward probability
  model_type <- as.numeric(readLines(con, n = 4)[1])  
  
@@ -108,5 +115,9 @@ if (model_type == 1 || model_type == 3) {
    legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3, col = 1:4)       
  }
  
-dev.off()
  close(con)
+
+pair <- read.table(file = cntF, skip = 3, sep = "\t")
+plot(pair[,1], pair[,2], xlab = "Number of Alignments", ylab = "Number of Reads", main = "Among alignable reads, distribution of # of alignments")
+
+dev.off()
diff --git a/simul.h b/simul.h

index 2719b3c9fb46fd67e6479775e3dfc6a1c113945f..ba105dbde965dfadb1deeaeb7def93e3357cbb9b 100644 (file)
--- a/simul.h
+++ b/simul.h
@@ -1,13 +1,17 @@
  #ifndef SIMUL_H_
  #define SIMUL_H_
  
+#include<ctime>
  #include<cassert>
  
+#include "boost/random.hpp"
  
  class simul {
  public:
  
-        virtual ~simul() {}
+       simul() : rg(boost::mt19937(time(NULL))) {
+       }
+
         // interval : [,)
         // random number should be in [0, arr[len - 1])
         // If by chance arr[len - 1] == 0.0, one possibility is to sample uniformly from 0 ... len - 1
@@ -29,9 +33,10 @@ public:
           return l;
         }
  
-       virtual double random() { return 0.0; };
+       double random() { return rg(); };
  
  private:
+       boost::uniform_01<boost::mt19937> rg;
  };
  
  #endif /* SIMUL_H_ */
diff --git a/simul_mersenne.h b/simul_mersenne.h

deleted file mode 100644 (file)

index 1a476d1..0000000
--- a/simul_mersenne.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef SIMUL_MERSENNE_H_
-#define SIMUL_MERSENNE_H_
-
-#include<ctime>
-
-#include "randomc.h"
-#include "simul.h"
-
-class simul_mersenne : public simul {
-public:
-       simul_mersenne() : rg(time(NULL)) { }
-
-       virtual ~simul_mersenne() {}
-       virtual double random() { return rg.Random(); }
-
-private:
-       CRandomMersenne rg;
-};
-
-#endif /* SIMUL_MERSENNE_H_ */
diff --git a/simulation.cpp b/simulation.cpp

index ecc0a11a20eb6b91f3bdb4b906cc187bf0cac7aa..2511f33693ce42a3751d07542dca2ce8139a0f54 100644 (file)
--- a/simulation.cpp
+++ b/simulation.cpp
@@ -27,7 +27,7 @@
  #include "Transcript.h"
  #include "Transcripts.h"
  
-#include "simul_mersenne.h"
+#include "simul.h"
  
  using namespace std;
  
@@ -48,7 +48,7 @@ char outReadF[2][STRLEN];
  char refF[STRLEN], groupF[STRLEN], tiF[STRLEN];
  char geneResF[STRLEN], isoResF[STRLEN];
  
-simul_mersenne sampler;
+simul sampler;
  
  void genOutReadStreams(int type, char *outFN) {
         switch(type) {
author	Bo Li <bli@cs.wisc.edu>
	Sun, 17 Apr 2011 14:57:59 +0000 (09:57 -0500)
committer	Bo Li <bli@cs.wisc.edu>
	Sun, 17 Apr 2011 14:57:59 +0000 (09:57 -0500)
EM.cpp		patch \| blob \| history
Gibbs.cpp		patch \| blob \| history
SamParser.h		patch \| blob \| history
calcCI.cpp		patch \| blob \| history
makefile		patch \| blob \| history
mersenne.cpp	[deleted file]	patch \| blob \| history
parseIt.cpp		patch \| blob \| history
randomc.h	[deleted file]	patch \| blob \| history
rsem-calculate-expression		patch \| blob \| history
rsem-plot-model		patch \| blob \| history
simul.h		patch \| blob \| history
simul_mersenne.h	[deleted file]	patch \| blob \| history
simulation.cpp		patch \| blob \| history