Allowed for a subset of reference sequences to be declared in an input SAM/BAM file...

author Bo Li <bli@cs.wisc.edu>

Mon, 16 Jun 2014 08:23:12 +0000 (03:23 -0500)

committer Bo Li <bli@cs.wisc.edu>

Mon, 16 Jun 2014 08:23:12 +0000 (03:23 -0500)
author Bo Li <bli@cs.wisc.edu>
Mon, 16 Jun 2014 08:23:12 +0000 (03:23 -0500)
committer Bo Li <bli@cs.wisc.edu>
Mon, 16 Jun 2014 08:23:12 +0000 (03:23 -0500)
diff --git a/Gibbs.cpp b/Gibbs.cpp

index d5e0b9c49634e29c9a4f2b5ed2a5c74026f5642f..7563dd49f508129992d6a52aa70ed3a8c3130b40 100644 (file)
--- a/Gibbs.cpp
+++ b/Gibbs.cpp
@@ -64,6 +64,7 @@ vector<Item> hits;
  vector<double> eel;
  double *mw;
  
+vector<int> pseudo_counts;
  vector<double> pme_c, pve_c; //global posterior mean and variance vectors on counts
  vector<double> pme_tpm, pme_fpkm;
  
@@ -86,18 +87,6 @@ int m_trans;
  GroupInfo gt, ta;
  vector<double> pve_c_genes, pve_c_trans;
  
-void load_group_info(char* refName) {
-  // Load group info
-  sprintf(groupF, "%s.grp", refName);
-  gi.load(groupF);
-  m = gi.getm();
-  
-  alleleS = isAlleleSpecific(refName, &gt, &ta); // if allele-specific 
-  m_trans = (alleleS ? ta.getm() : 0);
-
-  if (verbose) { printf("Loading group information is finished!\n"); }
-}
-
  void load_data(char* refName, char* statName, char* imdName) {
         ifstream fin;
         string line;
@@ -138,6 +127,29 @@ void load_data(char* refName, char* statName, char* imdName) {
         if (verbose) { printf("Loading data is finished!\n"); }
  }
  
+void load_group_info(char* refName) {
+  // Load group info
+  sprintf(groupF, "%s.grp", refName);
+  gi.load(groupF);
+  m = gi.getm();
+  
+  alleleS = isAlleleSpecific(refName, &gt, &ta); // if allele-specific 
+  m_trans = (alleleS ? ta.getm() : 0);
+
+  if (verbose) { printf("Loading group information is finished!\n"); }
+}
+
+// Load imdName.omit and initialize the pseudo count vector.
+void load_omit_info(const char* imdName) {
+  char omitF[STRLEN];
+  sprintf(omitF, "%s.omit", imdName);
+  FILE *fi = fopen(omitF, "r");
+  pseudo_counts.assign(M + 1, 1);
+  int tid;
+  while (fscanf(fi, "%d", &tid) == 1) pseudo_counts[tid] = 0;
+  fclose(fi);
+}
+
  template<class ModelType>
  void init_model_related(char* modelF) {
         ModelType model;
@@ -206,7 +218,7 @@ void sampleTheta(engine_type& engine, vector<double>& theta) {
         theta.assign(M + 1, 0);
         denom = 0.0;
         for (int i = 0; i <= M; i++) {
-               theta[i] = gmg();
+               theta[i] = (pseudo_counts[i] > 0 ? gmg() : 0.0);
                 denom += theta[i];
         }
         assert(denom > EPSILON);
@@ -226,7 +238,7 @@ void* Gibbs(void* arg) {
         Params *params = (Params*)arg;
  
         vector<double> theta, tpm, fpkm;
-       vector<int> z, counts;
+       vector<int> z, counts(pseudo_counts);
         vector<double> arr;
  
         uniform_01_generator rg(*params->engine, uniform_01_dist());
@@ -235,8 +247,6 @@ void* Gibbs(void* arg) {
         sampleTheta(*params->engine, theta);
  
         z.assign(N1, 0);
-
-       counts.assign(M + 1, 1); // 1 pseudo count
         counts[0] += N0;
  
         for (READ_INT_TYPE i = 0; i < N1; i++) {
@@ -274,8 +284,8 @@ void* Gibbs(void* arg) {
                                 polishTheta(M, theta, eel, mw);
                                 calcExpressionValues(M, theta, eel, tpm, fpkm);
                                 for (int i = 0; i <= M; i++) {
-                                       params->pme_c[i] += counts[i] - 1;
-                                       params->pve_c[i] += double(counts[i] - 1) * (counts[i] - 1);
+                                       params->pme_c[i] += counts[i] - pseudo_counts[i];
+                                       params->pve_c[i] += double(counts[i] - pseudo_counts[i]) * (counts[i] - pseudo_counts[i]);
                                         params->pme_tpm[i] += tpm[i];
                                         params->pme_fpkm[i] += fpkm[i];
                                 }
@@ -283,7 +293,7 @@ void* Gibbs(void* arg) {
                                 for (int i = 0; i < m; i++) {
                                   int b = gi.spAt(i), e = gi.spAt(i + 1);
                                   double count = 0.0;
-                                 for (int j = b; j < e; j++) count += counts[j] - 1;
+                                 for (int j = b; j < e; j++) count += counts[j] - pseudo_counts[j];
                                   params->pve_c_genes[i] += count * count;
                                 }
  
@@ -291,7 +301,7 @@ void* Gibbs(void* arg) {
                                   for (int i = 0; i < m_trans; i++) {
                                     int b = ta.spAt(i), e = ta.spAt(i + 1);
                                     double count = 0.0;
-                                   for (int j = b; j < e; j++) count += counts[j] - 1;
+                                   for (int j = b; j < e; j++) count += counts[j] - pseudo_counts[j];
                                     params->pve_c_trans[i] += count * count;
                                   }
                         }
@@ -410,8 +420,9 @@ int main(int argc, char* argv[]) {
                 printf("Warning: Number of samples is less than number of threads! Change the number of threads to %d!\n", nThreads);
         }
  
-       load_group_info(refName);
         load_data(refName, statName, imdName);
+       load_group_info(refName);
+       load_omit_info(imdName);
  
         sprintf(modelF, "%s.model", statName);
         FILE *fi = fopen(modelF, "r");
diff --git a/Makefile b/Makefile

index a9a705f801e461a84d071e3fd44635d564346e3c..54e2603983e77951b627dbe930036d41af4a9bfd 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -12,7 +12,7 @@ sam/libbam.a :
  
  Transcript.h : utils.h
  
-Transcripts.h : my_assert.h Transcript.h
+Transcripts.h : utils.h my_assert.h Transcript.h
  
  rsem-extract-reference-transcripts : utils.h my_assert.h GTFItem.h Transcript.h Transcripts.h extractRef.cpp
         $(CC) -Wall -O3 extractRef.cpp -o rsem-extract-reference-transcripts
diff --git a/SamParser.h b/SamParser.h

index e91f9c379539fc7ada0f8682bcfc7c957489d67f..81d4097cf0033dee55d012abeaf8ea106120b9d9 100644 (file)
--- a/SamParser.h
+++ b/SamParser.h
@@ -25,7 +25,7 @@
  
  class SamParser {
  public:
-       SamParser(char, const char*, Transcripts&, const char* = 0);
+       SamParser(char, const char*, const char*, Transcripts&, const char*);
         ~SamParser();
  
         /**
@@ -80,7 +80,7 @@ private:
  char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads
  
  // aux, if not 0, points to the file name of fn_list
-SamParser::SamParser(char inpType, const char* inpF, Transcripts& transcripts, const char* aux)
+SamParser::SamParser(char inpType, const char* inpF, const char* aux, Transcripts& transcripts, const char* imdName)
         : transcripts(transcripts)
  {
         switch(inpType) {
@@ -93,7 +93,7 @@ SamParser::SamParser(char inpType, const char* inpF, Transcripts& transcripts, c
         header = sam_in->header;
         general_assert(header != 0, "Fail to parse sam header!");
  
-       transcripts.buildMappings(header->n_targets, header->target_name);
+       transcripts.buildMappings(header->n_targets, header->target_name, imdName);
  
         b = bam_init1();
         b2 = bam_init1();
diff --git a/Transcripts.h b/Transcripts.h

index 033669b740e97737ce4f5f4bc3a25f016a083910..db8e64a879d50bdf1e2e11e8df1037a1e62c5467 100644 (file)
--- a/Transcripts.h
+++ b/Transcripts.h
@@ -13,6 +13,7 @@
  #include<map>
  #include<string>
  
+#include "utils.h"
  #include "my_assert.h"
  #include "Transcript.h"
  
@@ -68,7 +69,7 @@ public:
                 return transcripts[getInternalSid(eid)];
         }
  
-       void buildMappings(int, char**);
+       void buildMappings(int, char**, const char* = NULL);
  
  private:
         int M, type; // type 0 from genome, 1 standalone transcriptome, 2 allele-specific 
@@ -101,29 +102,43 @@ void Transcripts::writeTo(const char* outF) {
         fout.close();
  }
  
-void Transcripts::buildMappings(int n_targets, char** target_name) {
+void Transcripts::buildMappings(int n_targets, char** target_name, const char* imdName) {
         std::map<std::string, int> dict;
         std::map<std::string, int>::iterator iter;
+       std::vector<bool> appeared;
  
-       general_assert(n_targets == M, "Number of reference sequences does not match! Please check if the reads are aligned to a transcript set (instead of a genome)!");
+       general_assert(n_targets > 0, "The SAM/BAM file declares less than one reference sequence!");
+       general_assert(n_targets <=  M, "The SAM/BAM file declares more reference sequences (" + itos(n_targets) + ") than RSEM knows (" + itos(M) + ")!");
+       if (n_targets < M) printf("Warning: The SAM/BAM file declares less reference sequences (%d) than RSEM knows (%d)!\n", n_targets, M);
  
         dict.clear();
         for (int i = 1; i <= M; i++) {
                 const std::string& tid = isAlleleSpecific() ? transcripts[i].getSeqName() : transcripts[i].getTranscriptID();
                 iter = dict.find(tid);
-               general_assert(iter == dict.end(), tid + " appears more than once!");
+               general_assert(iter == dict.end(), "RSEM's indices might be corrupted, " + tid + " appears more than once!");
                 dict[tid] = i;
         }
  
         e2i.assign(M + 1, 0);
         i2e.assign(M + 1, 0);
+       appeared.assign(M + 1, false);
         for (int i = 0; i < n_targets; i++) {
                 iter = dict.find(std::string(target_name[i]));
                 general_assert(iter != dict.end(), "RSEM can not recognize reference sequence name " + cstrtos(target_name[i]) + "!");
-               general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " is duplicated!");
+               general_assert(iter->second > 0, "Reference sequence name " + cstrtos(target_name[i]) + " appears more than once in the SAM/BAM file!");
                 e2i[i + 1] = iter->second;
                 i2e[iter->second] = i + 1;
                 iter->second = -1;
+               appeared[e2i[i + 1]] = true;
+       }
+
+       if (imdName != NULL) {
+         char omitF[STRLEN];
+         sprintf(omitF, "%s.omit", imdName);
+         FILE *fo = fopen(omitF, "w");
+         for (int i = 1; i <= M; i++) 
+           if (!appeared[i]) fprintf(fo, "%d\n", i);
+         fclose(fo);
         }
  }
  
diff --git a/calcCI.cpp b/calcCI.cpp

index 86b3937ed8009fda5a40cfefdcc7825a0de0fc80..7cb58ffa07c24f5c6eb77f871e847aeb430ae7bf 100644 (file)
--- a/calcCI.cpp
+++ b/calcCI.cpp
@@ -106,18 +106,22 @@ void* sample_theta_from_c(void* arg) {
         int cnt = 0;
         while (fscanf(fi, "%d", &cvec[0]) == 1) {
                 for (int j = 1; j <= M; j++) assert(fscanf(fi, "%d", &cvec[j]) == 1);
+               assert(cvec[0] > 0);
  
                 ++cnt;
  
                 for (int j = 0; j <= M; j++) {
-                       gammas[j] = new gamma_dist(cvec[j]);
-                       rgs[j] = new gamma_generator(*(params->engine), *gammas[j]);
+                 gammas[j] = NULL; rgs[j] = NULL;
+                 if (cvec[j] > 0) {
+                   gammas[j] = new gamma_dist(cvec[j]);
+                   rgs[j] = new gamma_generator(*(params->engine), *gammas[j]);
+                 }
                 }
-
+                 
                 for (int i = 0; i < nSpC; i++) {
                         double sum = 0.0;
                         for (int j = 0; j <= M; j++) {
-                               theta[j] = ((j == 0 || (eel[j] >= EPSILON && mw[j] >= EPSILON)) ? (*rgs[j])() / mw[j] : 0.0);
+                               theta[j] = ((j == 0 || (cvec[j] > 0 && eel[j] >= EPSILON && mw[j] >= EPSILON)) ? (*rgs[j])() / mw[j] : 0.0);
                                 sum += theta[j];
                         }
                         assert(sum >= EPSILON);
@@ -138,8 +142,8 @@ void* sample_theta_from_c(void* arg) {
                 }
  
                 for (int j = 0; j <= M; j++) {
-                       delete gammas[j];
-                       delete rgs[j];
+                 if (gammas[j] != NULL) delete gammas[j];
+                 if (rgs[j] != NULL) delete rgs[j];
                 }
  
                 if (verbose && cnt % 100 == 0) { printf("Thread %d, %d count vectors are processed!\n", params->no, cnt); }
diff --git a/parseIt.cpp b/parseIt.cpp

index 3373ef280979428230b74d77439d61bff3a96d47..91a4c141da95323dc58b5a8b45e47cb543fa0a75 100644 (file)
--- a/parseIt.cpp
+++ b/parseIt.cpp
@@ -52,7 +52,7 @@ void init(const char* imdName, char alignFType, const char* alignF) {
  
         char* aux = 0;
         if (strcmp(fn_list, "")) aux = fn_list;
-       parser = new SamParser(alignFType, alignF, transcripts, aux);
+       parser = new SamParser(alignFType, alignF, aux, transcripts, imdName);
  
         memset(cat, 0, sizeof(cat));
         memset(readOutFs, 0, sizeof(readOutFs));
diff --git a/rsem-calculate-expression b/rsem-calculate-expression

index 16593c644e1ded7dac43ebea09e4a094b77d99c9..094c6e5ee3ebf7967fbb4eb44ead7b793e9b7b79 100755 (executable)
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -684,7 +684,7 @@ Maximum size (in memory, MB) of the auxiliary buffer used for computing credibil
  
  =item B<--ci-number-of-samples-per-count-vector> <int>
  
-The number of read generating probability vectors sampled per sampled count vector. The crebility intervals are calculated by first sampling P(C | D) and then sampling (\Theta | C) for each sampled count vector. This option controls how many \Theta vectors are sampled per sampled count vector. (Default: 50)
+The number of read generating probability vectors sampled per sampled count vector. The crebility intervals are calculated by first sampling P(C | D) and then sampling P(Theta | C) for each sampled count vector. This option controls how many Theta vectors are sampled per sampled count vector. (Default: 50)
  
  =item B<--samtools-sort-mem> <string>
author	Bo Li <bli@cs.wisc.edu>
	Mon, 16 Jun 2014 08:23:12 +0000 (03:23 -0500)
committer	Bo Li <bli@cs.wisc.edu>
	Mon, 16 Jun 2014 08:23:12 +0000 (03:23 -0500)
Gibbs.cpp		patch \| blob \| history
Makefile		patch \| blob \| history
SamParser.h		patch \| blob \| history
Transcripts.h		patch \| blob \| history
calcCI.cpp		patch \| blob \| history
parseIt.cpp		patch \| blob \| history
rsem-calculate-expression		patch \| blob \| history